#!/usr/bin/env python

"""
Small utility that parses Netscape bookmarks.
"""
# 11-6-2002 T. B. Passin.  Uses decoder wrapper for
#   output stream writer to convert to unicode.
#   Note that this required a coordinated change to
#   bookmarks.py so that the encoding can be passed
#   as a parameter.
# 12-1-2002 T. B. Passin.   Now unescapes "&" in attribute values.

# TODO:
# LAST_CHARSET: put in <info><metadata owner="mozilla"><last_charset>...</info>
# H3:LAST_MODIFIED: Put in XBEL 1.2?
# Cross references?
# Descriptions


from xml.sax import sax2exts,handler
import bookmark
import string, htmlentitydefs

# 12-1-2002 T. B. Passin.  Copied from msie_parse.
def unescape(data,entities={}):
    '''Unescape &amp;. This is needed mainly because sometimes
    URL strings are received with "&" escaped and sometimes they are not.

    This function is intended mainly to fix up URLs that may
    have undesired "&amp;" entities.  The "entities" dictionary
    is provided only for possible future use.'''

    data = data.replace('&amp;','&')
    return data

# --- SAX handler for Netscape bookmarks

class NetscapeHandler(handler.ContentHandler):

    def __init__(self):
        self.bms=bookmark.Bookmarks()
        self.cur_elem = None
        self.added    = None
        self.href     = None
        self.visited  = None
        self.modified = None
        self.latest   = None
        self.desc = ""

    def startElement(self,name,attrs):
        name = string.lower( name )
        d = {}
        for key, value in attrs.items():
            d[ string.lower(key) ] = unescape(value) 
##        print 'start', name, d
        if name=="h3":
            self.cur_elem="h3"
            if d.has_key("folded"):
                self.folded = "yes"
            else:
                self.folded = "no"
            self.id = d.get('id')
            self.added= d.get('add_date',"")
            self.modified = d.get('last_modified', "")
            folder = self.bms.add_folder('', None)
            folder.id = self.id
            folder.folded = self.folded
            folder.added = self.added
            self.latest = folder
        elif name=="a":
            self.cur_elem="a"
            self.bookmark = ""
            if d.has_key('add_date'): self.added=d["add_date"]
            else: self.added = None
            if d.has_key('last_visit'): self.visited=d["last_visit"]
            else: self.visited = None
            if d.has_key('last_modified'): self.modified=d["last_modified"]
            else: self.modified = None

            self.url=d["href"]
        elif name=='title':
            self.cur_elem = 'title'
            self.bms.title = ""
        elif name=='h1':
            self.cur_elem = 'h1'
            self.bms.desc = ""
        elif name=='hr':
            self.bms.add_separator()
        elif name=='meta':
            if d.has_key('http-equiv') and \
               string.lower(d['http-equiv'])=='content-type':
                value = string.split(d['content'], "charset=")
                if len(value) == 2:
                    the_parser.setProperty(handler.property_encoding, value[1])
        elif name in ('dt','dl'):
            if self.desc and not self.latest.desc:
                self.latest.desc = self.desc
            self.desc = ""
            self.curr_elem = ''
        elif name=='dd':
            self.cur_elem = 'dd'
            self.desc = ""


    def characters(self,data):
##        print 'char', self.cur_elem, data
        if self.cur_elem=="h3":
            self.latest.title = self.latest.title + data
        elif self.cur_elem=="a":
            self.bookmark = self.bookmark+data
        elif self.cur_elem=="title":
            self.bms.title = self.bms.title + data
        elif self.cur_elem=="h1":
            self.bms.desc = self.bms.desc + data
        elif self.cur_elem=="dd":
            self.desc = self.desc + data

    def skippedEntity(self, name):
        self.characters(htmlentitydefs.entitydefs[name])

    def endElement(self,name):
        name = string.lower( name )
##        print 'end', name
        if name=="a":
            self.latest = self.bms.add_bookmark(self.bookmark,
                                                added = self.added,
                                                visited = self.visited,
                                                modified = self.modified,
                                                href = self.url)
        elif name=="h3":
            self.cur_elem=None
        elif name=="dl":
            self.bms.leave_folder()
        elif name == self.cur_elem:
            self.cur_elem=None
            
    def endDocument(self):
        if self.desc and not self.latest.desc:
            self.latest.desc = self.desc
        
# --- Test-program

if __name__ == '__main__':
    import sys

    if len(sys.argv)<2 or len(sys.argv)>3:
        print
        print "A simple utility to convert Netscape bookmarks to XBEL."
        print
        print "Usage: "
        print "  ns_parse.py <ns-file> [<xbel-file>]"
        sys.exit(1)

    #import types
    import codecs

    ENCODING = 'utf-8'
    (encoder,decoder,reader,writer) = codecs.lookup(ENCODING)

    ns_handler=NetscapeHandler()
    the_parser = sax2exts.SGMLParserFactory.make_parser()
    the_parser.setContentHandler(ns_handler)
    # For Netscape 4, default to Latin-1
    the_parser.setProperty(handler.property_encoding, ENCODING)
    file = open(sys.argv[1], 'r')
    the_parser.parse(file)
    bms = ns_handler.bms

    if len(sys.argv)==3:
        out=writer(open(sys.argv[2],"w"))
        bms.dump_xbel(out,ENCODING)
        out.close()
    else:
        out = writer(sys.stdout)
        bms.dump_xbel(out,ENCODING)

    # Done

##     ns_handler=NetscapeHandler()

##     p=saxexts.SGMLParserFactory.make_parser()
##     p.setDocumentHandler(ns_handler)
##     p.parseFile(open(r"/home/amk/.netscape/bookmarks.html"))
##     ns_handler.bms.dump_xbel()
