import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
from time import mktime
import feedparser
import abc
import unicodedata
import re
import dateutil
from dateutil.parser import *
from datetime import *
import codecs
from StringIO import StringIO
from docx import *
from lxml import etree
import zipfile
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
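
# Generic disclosure-log scrapers for the disclosr FOI document collection:
# a GenericDisclogScraper base class plus concrete scrapers for disclogs
# published as PDF, DOCX, RSS feeds and OAIC-style HTML tables.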


class GenericDisclogScraper(object):
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None

    def remove_control_chars(self, input):
        return "".join([i for i in input if ord(i) in range(32, 127)])

    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID

    def getURL(self):
        """ disclog URL """
        if self.disclogURL is None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL

    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return
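

# A concrete agency scraper is a sibling script whose filename doubles as
# the agency ID (see getAgencyID above). A minimal, hypothetical sketch:
#
#   class ScraperImplementation(GenericOAICDisclogScraper):
#       def getColumns(self, columns):
#           (id, date, title, description, notes) = columns
#           return (id, date, title, description, notes)
#
#   if __name__ == '__main__':
#       ScraperImplementation().doScrape()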


class GenericPDFDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        # run the fetched PDF through pdfminer's text converter
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
            laparams=laparams)
        fp = StringIO()
        fp.write(content)
        process_pdf(rsrcmgr, device, fp, set(), caching=True,
            check_extractable=True)
        description = outfp.getvalue()
        fp.close()
        device.close()
        outfp.close()
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            # the whole extracted text serves as the description
            doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated",
                'description': description}
            foidocsdb.save(doc)
        else:
            print "already saved"
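

# Neither the PDF nor the DOCX scraper can pick out individual requests,
# so both hash the full extracted text and record a generic
# "Disclosure Log Updated" entry whenever that hash changes.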


class GenericDOCXDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        # a .docx is a zip archive; the body text lives in word/document.xml
        mydoc = zipfile.ZipFile(StringIO(content))
        xmlcontent = mydoc.read('word/document.xml')
        document = etree.fromstring(xmlcontent)
        # fetch all the text out of the document we just parsed
        paratextlist = getdocumenttext(document)
        # make an explicit UTF-8 encoded version of each paragraph
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        # join the paragraphs with two newlines between each
        description = '\n\n'.join(newparatextlist)
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                'url': self.getURL(), 'docID': dochash,
                "date": edate, "title": "Disclosure Log Updated",
                'description': description}
            foidocsdb.save(doc)
        else:
            print "already saved"


class GenericRSSDisclogScraper(GenericDisclogScraper):

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            print entry.id
            dochash = scrape.mkhash(entry.id)
            doc = foidocsdb.get(dochash)
            if doc is None:
                print "saving " + dochash
                edate = datetime.fromtimestamp(
                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                    'url': entry.link, 'docID': entry.id,
                    "date": edate, "title": entry.title}
                self.getDescription(entry, entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"

    def getDescription(self, content, entry, doc):
        """ get description from rss entry """
        doc.update({'description': content.summary})
        return
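

# Subclasses can override getDescription here, and getTable, getRows,
# getColumns and getDate on the table scraper below, to adapt to a
# particular agency's markup; the cell-level hooks all share the
# (content, entry, doc) signature.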


class GenericOAICDisclogScraper(GenericDisclogScraper):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return

    def getColumnCount(self):
        return 5

    def getDescription(self, content, entry, doc):
        """ get description from table cell """
        descriptiontxt = ""
        for string in content.stripped_strings:
            descriptiontxt = descriptiontxt + " \n" + string
        doc.update({'description': descriptiontxt})

    def getTitle(self, content, entry, doc):
        doc.update({'title': (''.join(content.stripped_strings))})

    def getTable(self, soup):
        return soup.table

    def getRows(self, table):
        return table.find_all('tr')

    def getDate(self, content, entry, doc):
        date = ''.join(content.stripped_strings).strip()
        (a, b, c) = date.partition("(")
        # fix a known agency typo before fuzzy parsing
        date = self.remove_control_chars(a.replace("Octber", "October"))
        print date
        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
        print edate
        doc.update({'date': edate})
        return

    def getLinks(self, content, entry, doc):
        links = []
        for atag in entry.find_all("a"):
            if atag.has_attr('href'):
                links.append(scrape.fullurl(content, atag['href']))
        if links:
            doc.update({'links': links})
        return

    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type in ("text/html",
                    "application/xhtml+xml", "application/xml"):
                # http://www.crummy.com/software/BeautifulSoup/documentation.html
                soup = BeautifulSoup(content)
                table = self.getTable(soup)
                for row in self.getRows(table):
                    columns = row.find_all('td')
                    if len(columns) == self.getColumnCount():
                        (id, date, title,
                            description, notes) = self.getColumns(columns)
                        print self.remove_control_chars(
                            ''.join(id.stripped_strings))
                        # key each row on url + id, falling back to
                        # url + date for rows without a usable id
                        if id.string is None:
                            dochash = scrape.mkhash(
                                self.remove_control_chars(
                                    url + (''.join(date.stripped_strings))))
                        else:
                            dochash = scrape.mkhash(
                                self.remove_control_chars(
                                    url + (''.join(id.stripped_strings))))
                        doc = foidocsdb.get(dochash)
                        if doc is None:
                            print "saving " + dochash
                            doc = {'_id': dochash,
                                'agencyID': self.getAgencyID(),
                                'url': self.getURL(),
                                'docID': (''.join(id.stripped_strings))}
                            self.getLinks(self.getURL(), row, doc)
                            self.getTitle(title, row, doc)
                            self.getDate(date, row, doc)
                            self.getDescription(description, row, doc)
                            if notes is not None:
                                doc.update({'notes': (
                                    ''.join(notes.stripped_strings))})
                            badtitles = ['-', 'Summary of FOI Request',
                                'FOI request(in summary form)',
                                'Summary of FOI request received by the ASC',
                                'Summary of FOI request received by agency/minister',
                                'Description of Documents Requested',
                                'FOI request', 'Description of FOI Request',
                                'Summary of request', 'Description', 'Summary',