From: Maxious
Date: Mon, 03 Dec 2012 08:41:38 +0000
Subject: codestyle
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=871c02c98ffe59a60559ab0d78b8ef98bf60166f
---
codestyle

Former-commit-id: e85a6fcfab76f7ea0b140471a810430aa544e81d
---
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,11 +1,13 @@
-import sys,os
+import sys
+import os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
 from bs4 import BeautifulSoup
 from time import mktime
 import feedparser
 import abc
-import unicodedata, re
+import unicodedata
+import re
 import dateutil
 from dateutil.parser import *
 from datetime import *
@@ -24,212 +26,235 @@
 from pdfminer.cmapdb import CMapDB
 from pdfminer.layout import LAParams
 
+
 class GenericDisclogScraper(object):
-    __metaclass__ = abc.ABCMeta
-    agencyID = None
-    disclogURL = None
-    def remove_control_chars(self, input):
-        return "".join([i for i in input if ord(i) in range(32, 127)])
-    def getAgencyID(self):
-        """ disclosr agency id """
-        if self.agencyID == None:
-            self.agencyID = os.path.basename(sys.argv[0]).replace(".py","")
-        return self.agencyID
-
-    def getURL(self):
-        """ disclog URL"""
-        if self.disclogURL == None:
-            agency = scrape.agencydb.get(self.getAgencyID())
-            self.disclogURL = agency['FOIDocumentsURL']
-        return self.disclogURL
-
-    @abc.abstractmethod
-    def doScrape(self):
-        """ do the scraping """
-        return
+    __metaclass__ = abc.ABCMeta
+    agencyID = None
+    disclogURL = None
+
+    def remove_control_chars(self, input):
+        return "".join([i for i in input if ord(i) in range(32, 127)])
+
+    def getAgencyID(self):
+        """ disclosr agency id """
+        if self.agencyID is None:
+            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
+        return self.agencyID
+
+    def getURL(self):
+        """ disclog URL"""
+        if self.disclogURL is None:
+            agency = scrape.agencydb.get(self.getAgencyID())
+            self.disclogURL = agency['FOIDocumentsURL']
+        return self.disclogURL
+
+    @abc.abstractmethod
+    def doScrape(self):
+        """ do the scraping """
+        return
+
 
 class GenericPDFDisclogScraper(GenericDisclogScraper):
 
-    def doScrape(self):
-        foidocsdb = scrape.couch['disclosr-foidocuments']
-        (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
-
-        laparams = LAParams()
-
-        rsrcmgr = PDFResourceManager(caching=True)
-
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+        laparams = LAParams()
+        rsrcmgr = PDFResourceManager(caching=True)
         outfp = StringIO.StringIO()
-
-        device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams)
-
-
+        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
+            laparams=laparams)
         fp = StringIO.StringIO()
         fp.write(content)
-        description = output.getvalue();
-        process_pdf(rsrcmgr, device, fp, set(), caching=True, check_extractable=True)
+        process_pdf(rsrcmgr, device, fp, set(), caching=True,
+            check_extractable=True)
+        description = outfp.getvalue()
         fp.close()
-        device.close()
-        outfp.close()
-
-        hash = scrape.mkhash(description)
-        #print hash
-        doc = foidocsdb.get(hash)
-        #print doc
-        if doc == None:
-            print "saving "+ hash
-            edate = datetime.fromtimestamp(mktime( )).strftime("%Y-%m-%d")
-            doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': hash,
-            "date": edate,"title": "Disclosure Log Updated"}
-            self.getDescription(entry,entry, doc)
-            foidocsdb.save(doc)
+        device.close()
+        outfp.close()
+        dochash = scrape.mkhash(description)
+        doc = foidocsdb.get(dochash)
+        if doc is None:
+            print "saving " + dochash
+            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
+            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
+            , 'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated"}
+            self.getDescription(entry, entry, doc)
+            foidocsdb.save(doc)
+        else:
+            print "already saved"
+
+
+class GenericDOCXDisclogScraper(GenericDisclogScraper):
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
+            , self.getURL(), "foidocuments", self.getAgencyID())
+        mydoc = zipfile.ZipFile(file)
+        xmlcontent = mydoc.read('word/document.xml')
+        document = etree.fromstring(xmlcontent)
+        ## Fetch all the text out of the document we just created
+        paratextlist = getdocumenttext(document)
+        # Make explicit unicode version
+        newparatextlist = []
+        for paratext in paratextlist:
+            newparatextlist.append(paratext.encode("utf-8"))
+        ## Print our documnts test with two newlines under each paragraph
+        description = '\n\n'.join(newparatextlist)
+        dochash = scrape.mkhash(description)
+        doc = foidocsdb.get(dochash)
+
+        if doc is None:
+            print "saving " + dochash
+            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
+            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
+            , 'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated"}
+            self.getDescription(entry, entry, doc)
+            foidocsdb.save(doc)
+        else:
+            print "already saved"
+
+
+class GenericRSSDisclogScraper(GenericDisclogScraper):
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+        feed = feedparser.parse(content)
+        for entry in feed.entries:
+            #print entry
+            print entry.id
+            dochash = scrape.mkhash(entry.id)
+            doc = foidocsdb.get(dochash)
+            #print doc
+            if doc is None:
+                print "saving " + dochash
+                edate = datetime.fromtimestamp(
+                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
+                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+                'url': entry.link, 'docID': entry.id,
+                "date": edate, "title": entry.title}
+                self.getDescription(entry, entry, doc)
+                foidocsdb.save(doc)
+            else:
+                print "already saved"
+
+    def getDescription(self, content, entry, doc):
+        """ get description from rss entry"""
+        doc.update({'description': content.summary})
+        return
+
+
+class GenericOAICDisclogScraper(GenericDisclogScraper):
+    __metaclass__ = abc.ABCMeta
+
+    @abc.abstractmethod
+    def getColumns(self, columns):
+        """ rearranges columns if required """
+        return
+
+    def getColumnCount(self):
+        return 5
+
+    def getDescription(self, content, entry, doc):
+        """ get description from rss entry"""
+        descriptiontxt = ""
+        for string in content.stripped_strings:
+            descriptiontxt = descriptiontxt + " \n" + string
+        doc.update({'description': descriptiontxt})
+
+    def getTitle(self, content, entry, doc):
+        doc.update({'title': (''.join(content.stripped_strings))})
+
+    def getTable(self, soup):
+        return soup.table
+
+    def getRows(self, table):
+        return table.find_all('tr')
+
+    def getDate(self, content, entry, doc):
+        date = ''.join(content.stripped_strings).strip()
+        (a, b, c) = date.partition("(")
+        date = self.remove_control_chars(a.replace("Octber", "October"))
+        print date
+        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+        print edate
+        doc.update({'date': edate})
+        return
+
+    def getLinks(self, content, entry, doc):
+        links = []
+        for atag in entry.find_all("a"):
+            if atag.has_key('href'):
+                links.append(scrape.fullurl(content, atag['href']))
+        if links != []:
+            doc.update({'links': links})
+        return
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+        if content is not None:
+            if mime_type == "text/html"\
+                    or mime_type == "application/xhtml+xml"\
+                    or mime_type == "application/xml":
+                # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                soup = BeautifulSoup(content)
+                table = self.getTable(soup)
+                for row in self.getRows(table):
+                    columns = row.find_all('td')
+                    if len(columns) == self.getColumnCount():
+                        (id, date, title,
+                         description, notes) = self.getColumns(columns)
+                        print self.remove_control_chars(
+                            ''.join(id.stripped_strings))
+                        if id.string is None:
+                            dochash = scrape.mkhash(
+                                self.remove_control_chars(
+                                    url + (''.join(date.stripped_strings))))
                         else:
-            print "already saved"
-
-
-class GenericDOCXDisclogScraper(GenericDisclogScraper):
-
-    def doScrape(self):
-        foidocsdb = scrape.couch['disclosr-foidocuments']
-        (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
-
-        mydoc = zipfile.ZipFile(file)
-        xmlcontent = mydoc.read('word/document.xml')
-        document = etree.fromstring(xmlcontent)
-
-        ## Fetch all the text out of the document we just created
-        paratextlist = getdocumenttext(document)
-
-        # Make explicit unicode version
-        newparatextlist = []
-        for paratext in paratextlist:
-            newparatextlist.append(paratext.encode("utf-8"))
-
-        ## Print our documnts test with two newlines under each paragraph
-        description = '\n\n'.join(newparatextlist)
-
-        hash = scrape.mkhash(description)
-        #print hash
-        doc = foidocsdb.get(hash)
-        #print doc
-        if doc == None:
-            print "saving "+ hash
-            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
-            doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': hash,
-            "date": edate,"title": "Disclosure Log Updated"}
-            self.getDescription(entry,entry, doc)
-            foidocsdb.save(doc)
-        else:
-            print "already saved"
-
-
-class GenericRSSDisclogScraper(GenericDisclogScraper):
-
-    def doScrape(self):
-        foidocsdb = scrape.couch['disclosr-foidocuments']
-        (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
-        feed = feedparser.parse(content)
-        for entry in feed.entries:
-            #print entry
-            print entry.id
-            hash = scrape.mkhash(entry.id)
-            #print hash
-            doc = foidocsdb.get(hash)
-            #print doc
-            if doc == None:
-                print "saving "+ hash
-                edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d")
-                doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
-                "date": edate,"title": entry.title}
-                self.getDescription(entry,entry, doc)
-                foidocsdb.save(doc)
-            else:
-                print "already saved"
-    def getDescription(self, content, entry, doc):
-        """ get description from rss entry"""
-        doc.update({'description': content.summary})
-        return
-
-class GenericOAICDisclogScraper(GenericDisclogScraper):
-    __metaclass__ = abc.ABCMeta
-    @abc.abstractmethod
-    def getColumns(self,columns):
-        """ rearranges columns if required """
-        return
-    def getColumnCount(self):
-        return 5
-    def getDescription(self, content, entry, doc):
-        """ get description from rss entry"""
descriptiontxt = "" - for string in content.stripped_strings: - descriptiontxt = descriptiontxt + " \n" + string - doc.update({'description': descriptiontxt}) - return - def getTitle(self, content, entry, doc): - doc.update({'title': (''.join(content.stripped_strings))}) - return - def getTable(self, soup): - return soup.table - def getRows(self, table): - return table.find_all('tr') - def getDate(self, content, entry, doc): - date = ''.join(content.stripped_strings).strip() - (a,b,c) = date.partition("(") - date = self.remove_control_chars(a.replace("Octber","October")) - print date - edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") - print edate - doc.update({'date': edate}) - return - def getLinks(self, content, entry, doc): - links = [] - for atag in entry.find_all("a"): - if atag.has_key('href'): - links.append(scrape.fullurl(content,atag['href'])) - if links != []: - doc.update({'links': links}) - return - - def doScrape(self): - foidocsdb = scrape.couch['disclosr-foidocuments'] - (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) - if content != None: - if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": - # http://www.crummy.com/software/BeautifulSoup/documentation.html - soup = BeautifulSoup(content) - table = self.getTable(soup) - for row in self.getRows(table): - columns = row.find_all('td') - if len(columns) == self.getColumnCount(): - (id, date, title, description, notes) = self.getColumns(columns) - print self.remove_control_chars(''.join(id.stripped_strings)) - if id.string == None: - hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings)))) - else: - hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings)))) - doc = foidocsdb.get(hash) - - if doc == None: - print "saving " +hash - doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': (''.join(id.stripped_strings))} - self.getLinks(self.getURL(),row,doc) - self.getTitle(title,row, doc) - self.getDate(date,row, doc) - self.getDescription(description,row, doc) - if notes != None: - doc.update({ 'notes': (''.join(notes.stripped_strings))}) - badtitles = ['-','Summary of FOI Request','FOI request(in summary form)','Summary of FOI request received by the ASC', -'Summary of FOI request received by agency/minister','Description of Documents Requested','FOI request','Description of FOI Request','Summary of request','Description','Summary', + dochash = scrape.mkhash( + self.remove_control_chars( + url + (''.join(id.stripped_strings)))) + doc = foidocsdb.get(hash) + + if doc is None: + print "saving " + hash + doc = {'_id': hash, + 'agencyID': self.getAgencyID(), + 'url': self.getURL(), + 'docID': (''.join(id.stripped_strings))} + self.getLinks(self.getURL(), row, doc) + self.getTitle(title, row, doc) + self.getDate(date, row, doc) + self.getDescription(description, row, doc) + if notes is not None: + doc.update({ 'notes': ( + ''.join(notes.stripped_strings))}) + badtitles = ['-','Summary of FOI Request' + , 'FOI request(in summary form)' + , 'Summary of FOI request received by the ASC', +'Summary of FOI request received by agency/minister', +'Description of Documents Requested','FOI request', +'Description of FOI Request','Summary of request','Description','Summary', 'Summary of FOIrequest received by agency/minister','Summary of FOI request received','Description of FOI Request',"FOI request",'Results 1 to 67 of 67'] - if 
-                            if doc['title'] not in badtitles and doc['description'] != '':
+                            if doc['title'] not in badtitles\
+                                    and doc['description'] != '':
                                 print "saving"
                                 foidocsdb.save(doc)
-                        else:
-                            print "already saved "+hash
-
-                    elif len(row.find_all('th')) == self.getColumnCount():
-                        print "header row"
-
-                    else:
-                        print "ERROR number of columns incorrect"
-                        print row
-
+                        else:
+                            print "already saved " + dochash
+
+                    elif len(row.find_all('th')) == self.getColumnCount():
+                        print "header row"
+
+                    else:
+                        print "ERROR number of columns incorrect"
+                        print row
+
--- a/documents/rss.xml.php
+++ b/documents/rss.xml.php
@@ -12,9 +12,9 @@
 $TestFeed->setTitle('disclosurelo.gs Newest Entries - All');
 $TestFeed->setLink('http://disclosurelo.gs/rss.xml.php');
 $TestFeed->setDescription('disclosurelo.gs Newest Entries - All Agencies');
-    $TestFeed->setChannelElement('language', 'en-us');
-    $TestFeed->setChannelElement('pubDate', date(DATE_RSS, time()));
-
+$TestFeed->setChannelElement('language', 'en-us');
+$TestFeed->setChannelElement('pubDate', date(DATE_RSS, time()));
+
 //Retriving informations from database
 $idtoname = Array();
 $agenciesdb = $server->get_db('disclosr-agencies');
@@ -22,18 +22,18 @@
     $idtoname[$row->id] = trim($row->value->name);
 }
 $foidocsdb = $server->get_db('disclosr-foidocuments');
-$rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99','0000-00-00', 50), true)->rows;
+$rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99', '0000-00-00', 50), true)->rows;
 //print_r($rows);
 foreach ($rows as $row) {
     //Create an empty FeedItem
     $newItem = $TestFeed->createNewItem();
     //Add elements to the feed item
     $newItem->setTitle($row->value->title);
-    $newItem->setLink("http://disclosurelo.gs/view.php?id=".$row->value->_id);
+    $newItem->setLink("http://disclosurelo.gs/view.php?id=" . $row->value->_id);
     $newItem->setDate(strtotime($row->value->date));
-    $newItem->setDescription(displayLogEntry($row,$idtoname));
+    $newItem->setDescription(displayLogEntry($row, $idtoname));
     $newItem->setAuthor($idtoname[$row->value->agencyID]);
-    $newItem->addElement('guid', "http://disclosurelo.gs/view.php?id=".$row->value->_id,array('isPermaLink'=>'true'));
+    $newItem->addElement('guid', "http://disclosurelo.gs/view.php?id=" . $row->value->_id, array('isPermaLink' => 'true'));
     //Now add the feed item
     $TestFeed->addItem($newItem);
 }
--- a/documents/template.inc.php
+++ b/documents/template.inc.php
@@ -98,9 +98,12 @@
         _gaq.push(['_trackPageview']);
 
         (function() {
-            var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+            var ga = document.createElement('script');
+            ga.type = 'text/javascript';
+            ga.async = true;
             ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
-            var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+            var s = document.getElementsByTagName('script')[0];
+            s.parentNode.insertBefore(ga, s);
         })();
 
@@ -146,9 +149,12 @@
 function displayLogEntry($row, $idtoname) {
     $result = "";
     $result .= '';
-    $result .= ': " . truncate($row->value->title, 120)."";
+    $result .= ': " . truncate($row->value->title, 120) . "";
     $result .= '(' . $idtoname[$row->value->agencyID] . ')';
-    $result .= "Title" . $row->value->title . "" . str_replace("\n", "<br>", $row->value->description);
+    $result .= "Title" . $row->value->title . "";
+    if (isset($row->value->description)) {
+        $result .= str_replace("\n", "<br>", $row->value->description);
+    }
     if (isset($row->value->notes)) {
         $result .= "<br>Note: " . $row->value->notes;
     }
@@ -157,7 +163,7 @@
     if (isset($row->value->links)) {
         $result .= 'Links/Documents";