import sys
import os

sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
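# the shared scrape module lives one directory up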
import scrape
from bs4 import BeautifulSoup
from time import mktime
import feedparser
import abc
import unicodedata
import re
from dateutil.parser import parse
from datetime import datetime, date
import codecs

import difflib
import zipfile
from lxml import etree
# getdocumenttext is expected from the legacy docx.py module (pre-1.0 python-docx)
from docx import getdocumenttext

from StringIO import StringIO

from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams


class GenericDisclogScraper(object):
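    """ Base class for disclosure log scrapers: derives the agency id from
    the scraper script's file name, looks up the disclosure log URL in the
    agency database and leaves the actual scraping to subclasses. """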
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None

    def remove_control_chars(self, text):
        """ strip characters outside the printable ASCII range """
        return "".join([c for c in text if 32 <= ord(c) < 127])

    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL"""
        if self.disclogURL is None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return


class GenericHTMLDisclogScraper(GenericDisclogScraper):
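    """ Fallback scraper for logs with no parsable structure: hashes the
    raw page and, when the hash is new, records a generic "updated" entry
    together with an HTML diff against the previously stored copy. """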
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        dochash = scrape.mkhash(content)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            description = "This log may have been updated but, as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries."
            # diff the previously stored copy of the page against the current one
            diff = ""
            last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
            if last_attach is not None:
                html_diff = difflib.HtmlDiff()
                diff = html_diff.make_table(last_attach.read().split('\n'),
                    content.split('\n'))
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                   'url': self.getURL(), 'docID': dochash,
                   "date": edate, "title": "Disclosure Log Updated",
                   "description": self.remove_control_chars(description),
                   "diff": diff}
            foidocsdb.save(doc)
        else:
            print "already saved"


class GenericPDFDisclogScraper(GenericDisclogScraper):
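    """ Scraper for disclosure logs published as a PDF: extracts the text
    with pdfminer and records a generic "updated" entry keyed on a hash of
    that text. """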
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
            laparams=laparams)
        # wrap the fetched PDF bytes in a file-like object for pdfminer
        fp = StringIO(content)

        process_pdf(rsrcmgr, device, fp, set(), caching=True,
            check_extractable=True)
        description = outfp.getvalue()
        fp.close()
        device.close()
        outfp.close()
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                   'url': self.getURL(), 'docID': dochash,
                   "date": edate, "title": "Disclosure Log Updated",
                   "description": self.remove_control_chars(description)}
            foidocsdb.save(doc)
        else:
            print "already saved"


class GenericDOCXDisclogScraper(GenericDisclogScraper):
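    """ Scraper for disclosure logs published as a .docx file: pulls the
    paragraph text out of word/document.xml and records a generic
    "updated" entry keyed on a hash of that text. """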
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        # a .docx file is a zip archive; read the main document XML out of it
        mydoc = zipfile.ZipFile(StringIO(content))
        xmlcontent = mydoc.read('word/document.xml')
        document = etree.fromstring(xmlcontent)
        # Pull all paragraph text out of the document XML
        paratextlist = getdocumenttext(document)
        # Encode each paragraph as UTF-8
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        # Join the paragraphs with a blank line between each
        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)

        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                   'url': self.getURL(), 'docID': dochash,
                   "date": edate, "title": "Disclosure Log Updated",
                   "description": description}
            foidocsdb.save(doc)
        else:
            print "already saved"


class GenericRSSDisclogScraper(GenericDisclogScraper):
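    """ Scraper for disclosure logs published as an RSS feed: each feed
    entry becomes its own document, keyed on a hash of the entry id. """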
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            #print entry
            print entry.id
            dochash = scrape.mkhash(entry.id)
            doc = foidocsdb.get(dochash)
            #print doc
            if doc is None:
                print "saving " + dochash
                edate = datetime.fromtimestamp(
                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                       'url': entry.link, 'docID': entry.id,
                       "date": edate, "title": entry.title}
                self.getDescription(entry, entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"

    def getDescription(self, content, entry, doc):
        """ get description from rss entry"""
        doc.update({'description': content.summary})


class GenericOAICDisclogScraper(GenericDisclogScraper):
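    """ Scraper for the common OAIC-style disclosure log: an HTML table
    whose rows hold id, date, title, description and notes cells.
    Subclasses override getColumns() (and optionally getTable(),
    getRows() or getColumnCount()) to match each agency's layout. """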
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def getColumnCount(self):
        return 5

    def getDescription(self, content, entry, doc):
        """ get description from rss entry"""
        descriptiontxt = ""
        for string in content.stripped_strings:
            descriptiontxt = descriptiontxt + " \n" + string
        doc.update({'description': descriptiontxt})

    def getTitle(self, content, entry, doc):
        doc.update({'title': (''.join(content.stripped_strings))})

    def getTable(self, soup):
        return soup.table

    def getRows(self, table):
        return table.find_all('tr')

    def getDate(self, content, entry, doc):
        date = ''.join(content.stripped_strings).strip()
        # drop any parenthesised suffix and fix known typos before parsing
        (date, _, _) = date.partition("(")
        date = self.remove_control_chars(date.replace("Octber", "October").replace("1012", "2012"))
        print date
        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
        print edate
        doc.update({'date': edate})
        return

    def getLinks(self, content, entry, doc):
        links = []
        for atag in entry.find_all("a"):
            if atag.has_attr('href'):
                links.append(scrape.fullurl(content, atag['href']))
        if links:
            doc.update({'links': links})
        return

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type in ("text/html", "application/xhtml+xml", "application/xml"):
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                print "parsing"
                soup = BeautifulSoup(content)
                table = self.getTable(soup)
                for row in self.getRows(table):
                    columns = row.find_all('td')
                    if len(columns) == self.getColumnCount():
                        (id, date, title,
                         description, notes) = self.getColumns(columns)
                        print self.remove_control_chars(
                            ''.join(id.stripped_strings))
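                        # if the id cell has no usable text, fall back to
                        # hashing the URL plus the date cell instead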
                        if id.string is None:
                            dochash = scrape.mkhash(
                                self.remove_control_chars(
                                    url + (''.join(date.stripped_strings))))
                        else:
                            dochash = scrape.mkhash(
                                self.remove_control_chars(
                                    url + (''.join(id.stripped_strings))))
                        doc = foidocsdb.get(dochash)

                        if doc is None:
                            print "saving " + dochash
                            doc = {'_id': dochash,
                                   'agencyID': self.getAgencyID(),
                                   'url': self.getURL(),
                                   'docID': (''.join(id.stripped_strings))}
                            self.getLinks(self.getURL(), row, doc)
                            self.getTitle(title, row, doc)
                            self.getDate(date, row, doc)
                            self.getDescription(description, row, doc)
                            if notes is not None:
                                doc.update({'notes': (
                                    ''.join(notes.stripped_strings))})
                            badtitles = ['-', 'Summary of FOI Request',
                                         'FOI request(in summary form)',
                                         'Summary of FOI request received by the ASC',
                                         'Summary of FOI request received by agency/minister',
                                         'Description of Documents Requested',
                                         'FOI request',
                                         'Description of FOI Request',
                                         'Summary of request', 'Description', 'Summary',
                                         'Summary of FOIrequest received by agency/minister',
                                         'Summary of FOI request received',
                                         'Description of    FOI Request',
                                         'Results 1 to 67 of 67']
                            if doc['title'] not in badtitles \
                                    and doc['description'] != '':
                                print "saving"
                                foidocsdb.save(doc)
                        else:
                            print "already saved " + dochash

                    elif len(row.find_all('th')) == self.getColumnCount():
                        print "header row"

                    else:
                        print "ERROR number of columns incorrect"
                        print row


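# A minimal sketch of how an agency-specific scraper might subclass
# GenericOAICDisclogScraper; the class name and straight-through column
# order below are hypothetical and would be adjusted to the agency's
# actual table layout.
#
#   import genericScrapers
#
#   class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
#       def getColumns(self, columns):
#           (id, date, title, description, notes) = columns
#           return (id, date, title, description, notes)
#
#   if __name__ == '__main__':
#       ScraperImplementation().doScrape()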