--- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -1,5 +1,6 @@ import sys import os + sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) import scrape from bs4 import BeautifulSoup @@ -51,12 +52,12 @@ """ do the scraping """ return + class GenericHTMLDisclogScraper(GenericDisclogScraper): - def doScrape(self): foidocsdb = scrape.couch['disclosr-foidocuments'] (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb, - self.getURL(), "foidocuments", self.getAgencyID()) + self.getURL(), "foidocuments", self.getAgencyID()) content = rcontent dochash = scrape.mkhash(content) doc = foidocsdb.get(dochash) @@ -66,33 +67,32 @@ last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL()) if last_attach != None: html_diff = difflib.HtmlDiff() - description = description + "\nChanges: " - description = description + html_diff.make_table(last_attach.read().split('\n'), - content.split('\n')) + diff = html_diff.make_table(last_attach.read().split('\n'), + content.split('\n')) edate = date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() - , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": description} + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description, "diff": diff} foidocsdb.save(doc) else: print "already saved" + class GenericPDFDisclogScraper(GenericDisclogScraper): - def doScrape(self): foidocsdb = scrape.couch['disclosr-foidocuments'] (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, - self.getURL(), "foidocuments", self.getAgencyID()) + self.getURL(), "foidocuments", self.getAgencyID()) laparams = LAParams() rsrcmgr = PDFResourceManager(caching=True) outfp = StringIO() device = TextConverter(rsrcmgr, outfp, codec='utf-8', - laparams=laparams) + laparams=laparams) fp = StringIO() fp.write(content) process_pdf(rsrcmgr, device, fp, set(), caching=True, - check_extractable=True) + check_extractable=True) description = outfp.getvalue() fp.close() device.close() @@ -103,19 +103,18 @@ print "saving " + dochash edate = date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() - , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": description} + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description} foidocsdb.save(doc) else: print "already saved" class GenericDOCXDisclogScraper(GenericDisclogScraper): - def doScrape(self): foidocsdb = scrape.couch['disclosr-foidocuments'] (url, mime_type, content) = scrape.fetchURL(scrape.docsdb - , self.getURL(), "foidocuments", self.getAgencyID()) + , self.getURL(), "foidocuments", self.getAgencyID()) mydoc = zipfile.ZipFile(file) xmlcontent = mydoc.read('word/document.xml') document = etree.fromstring(xmlcontent) @@ -125,7 +124,7 @@ newparatextlist = [] for paratext in paratextlist: newparatextlist.append(paratext.encode("utf-8")) - ## Print our documnts test with two newlines under each paragraph + ## Print our documnts test with two newlines under each paragraph description = '\n\n'.join(newparatextlist).strip(' \t\n\r') dochash = scrape.mkhash(description) doc = foidocsdb.get(dochash) @@ -134,42 +133,42 @@ print "saving " + dochash edate = time().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() - , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": description} + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description} foidocsdb.save(doc) else: print "already saved" class GenericRSSDisclogScraper(GenericDisclogScraper): - - def doScrape(self): - foidocsdb = scrape.couch['disclosr-foidocuments'] - (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, - self.getURL(), "foidocuments", self.getAgencyID()) - feed = feedparser.parse(content) - for entry in feed.entries: - #print entry - print entry.id - dochash = scrape.mkhash(entry.id) - doc = foidocsdb.get(dochash) - #print doc - if doc is None: - print "saving " + dochash - edate = datetime.fromtimestamp( - mktime(entry.published_parsed)).strftime("%Y-%m-%d") - doc = {'_id': dochash, 'agencyID': self.getAgencyID(), - 'url': entry.link, 'docID': entry.id, - "date": edate, "title": entry.title} - self.getDescription(entry, entry, doc) - foidocsdb.save(doc) - else: - print "already saved" - - def getDescription(self, content, entry, doc): - """ get description from rss entry""" - doc.update({'description': content.summary}) - return + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + feed = feedparser.parse(content) + for entry in feed.entries: + #print entry + print entry.id + dochash = scrape.mkhash(entry.id) + doc = foidocsdb.get(dochash) + #print doc + if doc is None: + print "saving " + dochash + edate = datetime.fromtimestamp( + mktime(entry.published_parsed)).strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID(), + 'url': entry.link, 'docID': entry.id, + "date": edate, "title": entry.title} + self.getDescription(entry, entry, doc) + foidocsdb.save(doc) + else: + print "already saved" + + def getDescription(self, content, entry, doc): + """ get description from rss entry""" + doc.update({'description': content.summary}) + + return class GenericOAICDisclogScraper(GenericDisclogScraper): @@ -187,7 +186,7 @@ """ get description from rss entry""" descriptiontxt = "" for string in content.stripped_strings: - descriptiontxt = descriptiontxt + " \n" + string + descriptiontxt = descriptiontxt + " \n" + string doc.update({'description': descriptiontxt}) def getTitle(self, content, entry, doc): @@ -215,7 +214,7 @@ if atag.has_key('href'): links.append(scrape.fullurl(content, atag['href'])) if links != []: - doc.update({'links': links}) + doc.update({'links': links}) return def doScrape(self): @@ -232,7 +231,7 @@ columns = row.find_all('td') if len(columns) is self.getColumnCount(): (id, date, title, - description, notes) = self.getColumns(columns) + description, notes) = self.getColumns(columns) print self.remove_control_chars( ''.join(id.stripped_strings)) if id.string is None: @@ -248,27 +247,29 @@ if doc is None: print "saving " + dochash doc = {'_id': dochash, - 'agencyID': self.getAgencyID(), - 'url': self.getURL(), - 'docID': (''.join(id.stripped_strings))} + 'agencyID': self.getAgencyID(), + 'url': self.getURL(), + 'docID': (''.join(id.stripped_strings))} self.getLinks(self.getURL(), row, doc) self.getTitle(title, row, doc) self.getDate(date, row, doc) self.getDescription(description, row, doc) if notes is not None: - doc.update({ 'notes': ( + doc.update({'notes': ( ''.join(notes.stripped_strings))}) - badtitles = ['-','Summary of FOI Request' - , 'FOI request(in summary form)' - , 'Summary of FOI request received by the ASC', -'Summary of FOI request received by agency/minister', -'Description of Documents Requested','FOI request', -'Description of FOI Request','Summary of request','Description','Summary', -'Summary of FOIrequest received by agency/minister','Summary of FOI request received','Description of FOI Request',"FOI request",'Results 1 to 67 of 67'] + badtitles = ['-', 'Summary of FOI Request' + , 'FOI request(in summary form)' + , 'Summary of FOI request received by the ASC', + 'Summary of FOI request received by agency/minister', + 'Description of Documents Requested', 'FOI request', + 'Description of FOI Request', 'Summary of request', 'Description', 'Summary', + 'Summary of FOIrequest received by agency/minister', + 'Summary of FOI request received', 'Description of FOI Request', + "FOI request", 'Results 1 to 67 of 67'] if doc['title'] not in badtitles\ and doc['description'] != '': - print "saving" - foidocsdb.save(doc) + print "saving" + foidocsdb.save(doc) else: print "already saved " + dochash