--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,5 +1,6 @@
 import sys
 import os
+
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
 from bs4 import BeautifulSoup
@@ -51,12 +52,12 @@
         """ do the scraping """
         return
 
+
 class GenericHTMLDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
-            self.getURL(), "foidocuments", self.getAgencyID())
+             self.getURL(), "foidocuments", self.getAgencyID())
         content = rcontent
         dochash = scrape.mkhash(content)
         doc = foidocsdb.get(dochash)
@@ -66,33 +67,33 @@
             last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
             if last_attach != None:
                 html_diff = difflib.HtmlDiff()
-                description = description + "\nChanges: "
-                description = description + html_diff.make_table(last_attach.read().split('\n'),
-                    content.split('\n'))
+                diff = html_diff.make_table(last_attach.read().split('\n'),
+                    content.split('\n'))
             edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": self.remove_control_chars(description)}
+            "date": edate, "title": "Disclosure Log Updated",
+            "description": self.remove_control_chars(description), "diff": self.remove_control_chars(diff)}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
+
 class GenericPDFDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
-            self.getURL(), "foidocuments", self.getAgencyID())
+             self.getURL(), "foidocuments", self.getAgencyID())
         laparams = LAParams()
         rsrcmgr = PDFResourceManager(caching=True)
         outfp = StringIO()
         device = TextConverter(rsrcmgr, outfp, codec='utf-8',
-            laparams=laparams)
+             laparams=laparams)
         fp = StringIO()
         fp.write(content)
         process_pdf(rsrcmgr, device, fp, set(), caching=True,
-            check_extractable=True)
+             check_extractable=True)
         description = outfp.getvalue()
         fp.close()
         device.close()
 
@@ -111,11 +112,10 @@
 
 
 class GenericDOCXDisclogScraper(GenericDisclogScraper):
-
     def doScrape(self):
         foidocsdb = scrape.couch['disclosr-foidocuments']
         (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
-        , self.getURL(), "foidocuments", self.getAgencyID())
+         , self.getURL(), "foidocuments", self.getAgencyID())
         mydoc = zipfile.ZipFile(file)
         xmlcontent = mydoc.read('word/document.xml')
         document = etree.fromstring(xmlcontent)
@@ -125,7 +125,7 @@
         newparatextlist = []
         for paratext in paratextlist:
             newparatextlist.append(paratext.encode("utf-8"))
-            ## Print our documnts test with two newlines under each paragraph
+        ## Print our documnts test with two newlines under each paragraph
         description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
         dochash = scrape.mkhash(description)
         doc = foidocsdb.get(dochash)
@@ -134,42 +134,42 @@
             print "saving " + dochash
             edate = time().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
-            , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated", "description": description}
+                   , 'url': self.getURL(), 'docID': dochash,
+                   "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"
 
 class GenericRSSDisclogScraper(GenericDisclogScraper):
-
-        def doScrape(self):
-                foidocsdb = scrape.couch['disclosr-foidocuments']
-                (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
-                 self.getURL(), "foidocuments", self.getAgencyID())
-                feed = feedparser.parse(content)
-                for entry in feed.entries:
-                        #print entry
-                        print entry.id
-                        dochash = scrape.mkhash(entry.id)
-                        doc = foidocsdb.get(dochash)
-                        #print doc
-                        if doc is None:
-                                print "saving " + dochash
-                                edate = datetime.fromtimestamp(
-                                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
-                                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
-                                    'url': entry.link, 'docID': entry.id,
-                                    "date": edate, "title": entry.title}
-                                self.getDescription(entry, entry, doc)
-                                foidocsdb.save(doc)
-                        else:
-                                print "already saved"
-
-        def getDescription(self, content, entry, doc):
-                """ get description from rss entry"""
-                doc.update({'description': content.summary})
-                return
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+             self.getURL(), "foidocuments", self.getAgencyID())
+        feed = feedparser.parse(content)
+        for entry in feed.entries:
+            #print entry
+            print entry.id
+            dochash = scrape.mkhash(entry.id)
+            doc = foidocsdb.get(dochash)
+            #print doc
+            if doc is None:
+                print "saving " + dochash
+                edate = datetime.fromtimestamp(
+                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
+                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+                       'url': entry.link, 'docID': entry.id,
+                       "date": edate, "title": entry.title}
+                self.getDescription(entry, entry, doc)
+                foidocsdb.save(doc)
+            else:
+                print "already saved"
+
+    def getDescription(self, content, entry, doc):
+        """ get description from rss entry"""
+        doc.update({'description': content.summary})
+
+        return
 
 
 class GenericOAICDisclogScraper(GenericDisclogScraper):
 
@@ -187,7 +187,7 @@
         """ get description from rss entry"""
         descriptiontxt = ""
         for string in content.stripped_strings:
-                descriptiontxt = descriptiontxt + " \n" + string
+            descriptiontxt = descriptiontxt + " \n" + string
         doc.update({'description': descriptiontxt})
 
     def getTitle(self, content, entry, doc):
@@ -198,13 +198,31 @@
     def getRows(self, table):
         return table.find_all('tr')
 
+    def findColumns(self, row):
+        return row.find_all('td')
+
+    def getDocHash(self, id,date, url):
+        if id.string is None:
+            print "no id, using date as hash"
+            return scrape.mkhash(
+                self.remove_control_chars(
+                    url + (''.join(date.stripped_strings))))
+        else:
+            return scrape.mkhash(
+                self.remove_control_chars(
+                    url + (''.join(id.stripped_strings))))
     def getDate(self, content, entry, doc):
-        date = ''.join(content.stripped_strings).strip()
-        (a, b, c) = date.partition("(")
-        date = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012"))
-        print date
-        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+        strdate = ''.join(content.stripped_strings).strip()
+        (a, b, c) = strdate.partition("(")
+        strdate = self.remove_control_chars(a.replace("Octber", "October").replace("1012","2012").replace("Janrurary", "January").replace("1012","2012"))
+        print strdate
+        try:
+            edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+        except ValueError:
+            print >> sys.stderr, "ERROR date invalid %s " % strdate
+            print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip()
+            edate = date.today().strftime("%Y-%m-%d")
         print edate
         doc.update({'date': edate})
         return
 
@@ -215,7 +233,7 @@
             if atag.has_key('href'):
                 links.append(scrape.fullurl(content, atag['href']))
         if links != []:
-                    doc.update({'links': links})
+            doc.update({'links': links})
         return
 
     def doScrape(self):
@@ -229,46 +247,40 @@
             soup = BeautifulSoup(content)
             table = self.getTable(soup)
             for row in self.getRows(table):
-                columns = row.find_all('td')
+                columns = self.findColumns(row)
                 if len(columns) is self.getColumnCount():
                     (id, date, title,
-                    description, notes) = self.getColumns(columns)
+                     description, notes) = self.getColumns(columns)
                     print self.remove_control_chars(
                         ''.join(id.stripped_strings))
-                    if id.string is None:
-                        dochash = scrape.mkhash(
-                            self.remove_control_chars(
-                                url + (''.join(date.stripped_strings))))
-                    else:
-                        dochash = scrape.mkhash(
-                            self.remove_control_chars(
-                                url + (''.join(id.stripped_strings))))
+                    dochash = self.getDocHash(id,date,url)
                     doc = foidocsdb.get(dochash)
                     if doc is None:
                         print "saving " + dochash
                         doc = {'_id': dochash,
-                        'agencyID': self.getAgencyID(),
-                        'url': self.getURL(),
-                        'docID': (''.join(id.stripped_strings))}
+                               'agencyID': self.getAgencyID(),
+                               'url': self.getURL(),
+                               'docID': (''.join(id.stripped_strings))}
                         self.getLinks(self.getURL(), row, doc)
                         self.getTitle(title, row, doc)
                         self.getDate(date, row, doc)
                         self.getDescription(description, row, doc)
                         if notes is not None:
-                            doc.update({ 'notes': (
+                            doc.update({'notes': (
                                 ''.join(notes.stripped_strings))})
-                        badtitles = ['-','Summary of FOI Request'
-                            , 'FOI request(in summary form)'
-                            , 'Summary of FOI request received by the ASC',
-'Summary of FOI request received by agency/minister',
-'Description of Documents Requested','FOI request',
-'Description of FOI Request','Summary of request','Description','Summary',
-'Summary of FOIrequest received by agency/minister','Summary of FOI request received','Description of FOI Request',"FOI request",'Results 1 to 67 of 67']
-                        if doc['title'] not in badtitles\
-                        and doc['description'] != '':
-                            print "saving"
-                            foidocsdb.save(doc)
+                        badtitles = ['-', 'Summary of FOI Request'
+                            , 'FOI request(in summary form)'
+                            , 'Summary of FOI request received by the ASC',
+                            'Summary of FOI request received by agency/minister',
+                            'Description of Documents Requested', 'FOI request',
+                            'Description of FOI Request', 'Summary of request', 'Description', 'Summary',
+                            'Summary of FOIrequest received by agency/minister',
+                            'Summary of FOI request received', 'Description of FOI Request',
+                            "FOI request", 'Results 1 to 67 of 67']
+                        if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '':
+                            print "saving"
+                            foidocsdb.save(doc)
                     else:
                         print "already saved " + dochash
 
@@ -276,6 +288,6 @@
                 print "header row"
             else:
-                print "ERROR number of columns incorrect"
+                print >> sys.stderr, "ERROR number of columns incorrect"
                 print row