From: Maxious <maxious@lambdacomplex.org>
Date: Mon, 03 Dec 2012 12:05:38 +0000
Subject: fix scraping
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=6b5dfd7e2271af6e25ef0285597ae9749a9c6392
---
fix scraping


Former-commit-id: c96cc5c23e3497cb03991f1ee4e2990548817cf3
---


--- /dev/null
+++ b/documents/disclosr-documents.nja
@@ -1,1 +1,7 @@
-
+{
+  "venv": "", 
+  "project-type": "Import from sources", 
+  "name": "disclosr-documents", 
+  "license": "GNU General Public License v3", 
+  "description": ""
+}

--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,155 +1,254 @@
-import sys,os
+import sys
+import os
 sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
 import scrape
 from bs4 import BeautifulSoup
 from time import mktime
 import feedparser
 import abc
-import unicodedata, re
+import unicodedata
+import re
 import dateutil
 from dateutil.parser import *
 from datetime import *
 import codecs
 
+from StringIO import StringIO
+
+from pdfminer.pdfparser import PDFDocument, PDFParser
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
+from pdfminer.pdfdevice import PDFDevice, TagExtractor
+from pdfminer.converter import TextConverter
+from pdfminer.cmapdb import CMapDB
+from pdfminer.layout import LAParams
+
+
 class GenericDisclogScraper(object):
-        __metaclass__ = abc.ABCMeta
-	agencyID = None
-	disclogURL = None
-	def remove_control_chars(self, input):
-		return "".join([i for i in input if ord(i) in range(32, 127)])
-        def getAgencyID(self):
-                """ disclosr agency id """
-		if self.agencyID == None:
-			self.agencyID = os.path.basename(sys.argv[0]).replace(".py","")
-                return self.agencyID
-
-        def getURL(self):
-                """ disclog URL"""
-		if self.disclogURL == None:
-			agency = scrape.agencydb.get(self.getAgencyID())
-			self.disclogURL = agency['FOIDocumentsURL']
-                return self.disclogURL
-
-	@abc.abstractmethod
-	def doScrape(self):
-		""" do the scraping """
-		return
-
-	@abc.abstractmethod
-        def getDescription(self, content, entry, doc):
-                """ get description"""
-		return
-
+    __metaclass__ = abc.ABCMeta
+    agencyID = None
+    disclogURL = None
+
+    def remove_control_chars(self, input):
+        return "".join([i for i in input if ord(i) in range(32, 127)])
+
+    def getAgencyID(self):
+        """ disclosr agency id """
+        if self.agencyID is None:
+            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
+        return self.agencyID
+
+    def getURL(self):
+        """ disclog URL"""
+        if self.disclogURL is None:
+            agency = scrape.agencydb.get(self.getAgencyID())
+            self.disclogURL = agency['FOIDocumentsURL']
+        return self.disclogURL
+
+    @abc.abstractmethod
+    def doScrape(self):
+        """ do the scraping """
+        return
+
+
+class GenericPDFDisclogScraper(GenericDisclogScraper):
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+             self.getURL(), "foidocuments", self.getAgencyID())
+        laparams = LAParams()
+        rsrcmgr = PDFResourceManager(caching=True)
+        outfp = StringIO()
+        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
+             laparams=laparams)
+        fp = StringIO()
+        fp.write(content.read())
+
+        process_pdf(rsrcmgr, device, fp, set(), caching=True,
+             check_extractable=True)
+        description = outfp.getvalue()
+        fp.close()
+        device.close()
+        outfp.close()
+        dochash = scrape.mkhash(description)
+        doc = foidocsdb.get(dochash)
+        if doc is None:
+            print "saving " + dochash
+            edate = date.today().strftime("%Y-%m-%d")
+            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
+            , 'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            foidocsdb.save(doc)
+        else:
+            print "already saved"
+
+
+class GenericDOCXDisclogScraper(GenericDisclogScraper):
+    # NOTE(review): zipfile, etree and getdocumenttext are never imported in this module, and 'file' is the builtin type, not the fetched content
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
+        , self.getURL(), "foidocuments", self.getAgencyID())
+        mydoc = zipfile.ZipFile(file)
+        xmlcontent = mydoc.read('word/document.xml')
+        document = etree.fromstring(xmlcontent)
+        # Fetch all the text out of the parsed document
+        paratextlist = getdocumenttext(document)
+        # Encode each paragraph as a UTF-8 byte string
+        newparatextlist = []
+        for paratext in paratextlist:
+            newparatextlist.append(paratext.encode("utf-8"))
+        # Join the paragraphs with a blank line between them and trim surrounding whitespace
+        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
+        dochash = scrape.mkhash(description)
+        doc = foidocsdb.get(dochash)
+
+        if doc is None:
+            print "saving " + dochash
+            edate = date.today().strftime("%Y-%m-%d")
+            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
+            , 'url': self.getURL(), 'docID': dochash,
+            "date": edate, "title": "Disclosure Log Updated", "description": description}
+            foidocsdb.save(doc)
+        else:
+            print "already saved"
 
 
 class GenericRSSDisclogScraper(GenericDisclogScraper):
 
-       	def doScrape(self):
-               	foidocsdb = scrape.couch['disclosr-foidocuments']
-                (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
-		feed = feedparser.parse(content)		
-		for entry in feed.entries:
-			#print entry
-			print entry.id
-			hash = scrape.mkhash(entry.id)
-			#print hash
-		  	doc = foidocsdb.get(hash)
-			#print doc
-			if doc == None:
-                        	print "saving "+ hash
-				edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d")
-                                doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
-                                "date": edate,"title": entry.title}
-				self.getDescription(entry,entry, doc)
-                                foidocsdb.save(doc)
+        def doScrape(self):
+            foidocsdb = scrape.couch['disclosr-foidocuments']
+            (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+                 self.getURL(), "foidocuments", self.getAgencyID())
+            feed = feedparser.parse(content)
+            for entry in feed.entries:
+                # the hash of entry.id is used as the document _id below
+                print entry.id
+                dochash = scrape.mkhash(entry.id)
+                doc = foidocsdb.get(dochash)
+                # only entries not already present in foidocsdb are saved
+                if doc is None:
+                    print "saving " + dochash
+                    edate = datetime.fromtimestamp(
+                        mktime(entry.published_parsed)).strftime("%Y-%m-%d")
+                    doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
+                        'url': entry.link, 'docID': entry.id,
+                        "date": edate, "title": entry.title}
+                    self.getDescription(entry, entry, doc)
+                    foidocsdb.save(doc)
+                else:
+                    print "already saved"
+
+        def getDescription(self, content, entry, doc):
+            """ get description from rss entry"""
+            # defined at method level (not nested in doScrape) so self.getDescription resolves
+            doc.update({'description': content.summary})
+
+
+class GenericOAICDisclogScraper(GenericDisclogScraper):
+    __metaclass__ = abc.ABCMeta
+
+    @abc.abstractmethod
+    def getColumns(self, columns):
+        """ rearranges columns if required """
+        return
+
+    def getColumnCount(self):
+        return 5
+
+    def getDescription(self, content, entry, doc):
+        """ get description from the text of a table cell"""
+        descriptiontxt = ""
+        for string in content.stripped_strings:
+                    descriptiontxt = descriptiontxt + " \n" + string
+        doc.update({'description': descriptiontxt})
+
+    def getTitle(self, content, entry, doc):
+        doc.update({'title': (''.join(content.stripped_strings))})
+
+    def getTable(self, soup):
+        return soup.table
+
+    def getRows(self, table):
+        return table.find_all('tr')
+
+    def getDate(self, content, entry, doc):
+        date = ''.join(content.stripped_strings).strip()
+        (a, b, c) = date.partition("(")
+        date = self.remove_control_chars(a.replace("Octber", "October"))
+        print date
+        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+        print edate
+        doc.update({'date': edate})
+        return
+
+    def getLinks(self, content, entry, doc):
+        links = []
+        for atag in entry.find_all("a"):
+            if atag.has_key('href'):
+                links.append(scrape.fullurl(content, atag['href']))
+        if links != []:
+                    doc.update({'links': links})
+        return
+
+    def doScrape(self):
+        foidocsdb = scrape.couch['disclosr-foidocuments']
+        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
+            self.getURL(), "foidocuments", self.getAgencyID())
+        if content is not None:
+            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
+            # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                print "parsing"
+                soup = BeautifulSoup(content)
+                table = self.getTable(soup)
+                for row in self.getRows(table):
+                    columns = row.find_all('td')
+                    if len(columns) is self.getColumnCount():
+                        (id, date, title,
+                        description, notes) = self.getColumns(columns)
+                        print self.remove_control_chars(
+                            ''.join(id.stripped_strings))
+                        if id.string is None:
+                            dochash = scrape.mkhash(
+                                self.remove_control_chars(
+                                    url + (''.join(date.stripped_strings))))
                         else:
-                        	print "already saved"			
-        def getDescription(self, content, entry, doc):
-                """ get description from rss entry"""
-                doc.update({'description': content.summary})
-		return
-
-class GenericOAICDisclogScraper(GenericDisclogScraper):
-        __metaclass__ = abc.ABCMeta
-	@abc.abstractmethod
-	def getColumns(self,columns):
-		""" rearranges columns if required """
-		return
-        def getColumnCount(self):
-                return 5
-        def getDescription(self, content, entry, doc):
-                """ get description from rss entry"""
-		descriptiontxt = ""
-		for string in content.stripped_strings:
-                	descriptiontxt = descriptiontxt + " \n" + string
-                doc.update({'description': descriptiontxt})
-		return
-        def getTitle(self, content, entry, doc):
-                doc.update({'title': (''.join(content.stripped_strings))})
-		return
-	def getTable(self, soup):
-		return soup.table
-	def getRows(self, table):
-		return table.find_all('tr')
-	def getDate(self, content, entry, doc):
-		date = ''.join(content.stripped_strings).strip()
-		(a,b,c) = date.partition("(")
-		date = self.remove_control_chars(a.replace("Octber","October"))
-		print date
-		edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
-		print edate
-		doc.update({'date': edate})
-		return
-	def getLinks(self, content, entry, doc):
-                links = []
-                for atag in entry.find_all("a"):
-                       	if atag.has_key('href'):
-                               	links.append(scrape.fullurl(content,atag['href']))
-                if links != []:
-	                doc.update({'links': links})
-		return
-
-	def doScrape(self):
-		foidocsdb = scrape.couch['disclosr-foidocuments']
-		(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID())
-		if content != None:
-			if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
-			# http://www.crummy.com/software/BeautifulSoup/documentation.html
-				soup = BeautifulSoup(content)
-				table = self.getTable(soup)
-				for row in self.getRows(table):
-					columns = row.find_all('td')
-					if len(columns) == self.getColumnCount():
-						(id, date, title, description, notes) = self.getColumns(columns)
-						print self.remove_control_chars(''.join(id.stripped_strings))
-						if id.string == None:
-							hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
-						else:
-							hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings))))
-						doc = foidocsdb.get(hash)
-							
-						if doc == None:
-							print "saving " +hash
-							doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': (''.join(id.stripped_strings))}
-							self.getLinks(self.getURL(),row,doc)
-                                			self.getTitle(title,row, doc)
-                                			self.getDate(date,row, doc)
-							self.getDescription(description,row, doc)
-							if notes != None:
-                                        			doc.update({ 'notes': (''.join(notes.stripped_strings))})
-                                                        badtitles = ['-','Summary of FOI Request','FOI request(in summary form)','Summary of FOI request received by the ASC',
-'Summary of FOI request received by agency/minister','Description of Documents Requested','FOI request','Description of FOI Request','Summary of request','Description','Summary',
+                            dochash = scrape.mkhash(
+                                self.remove_control_chars(
+                                    url + (''.join(id.stripped_strings))))
+                        doc = foidocsdb.get(dochash)
+
+                        if doc is None:
+                            print "saving " + dochash
+                            doc = {'_id': dochash,
+                            'agencyID': self.getAgencyID(),
+                            'url': self.getURL(),
+                            'docID': (''.join(id.stripped_strings))}
+                            self.getLinks(self.getURL(), row, doc)
+                            self.getTitle(title, row, doc)
+                            self.getDate(date, row, doc)
+                            self.getDescription(description, row, doc)
+                            if notes is not None:
+                                doc.update({ 'notes': (
+                                    ''.join(notes.stripped_strings))})
+                            badtitles = ['-','Summary of FOI Request'
+                            , 'FOI request(in summary form)'
+                            , 'Summary of FOI request received by the ASC',
+'Summary of FOI request received by agency/minister',
+'Description of Documents Requested','FOI request',
+'Description of FOI Request','Summary of request','Description','Summary',
 'Summary of FOIrequest received by agency/minister','Summary of FOI request received','Description of    FOI Request',"FOI request",'Results 1 to 67 of 67']
-							if doc['title'] not in badtitles and doc['description'] != '':
+                            if doc['title'] not in badtitles\
+                            and doc['description'] != '':
                                                             print "saving"
                                                             foidocsdb.save(doc)
-						else:
-							print "already saved "+hash
-					
-					elif len(row.find_all('th')) == self.getColumnCount():
-						print "header row"
-					
-					else:
-						print "ERROR number of columns incorrect"
-						print row
-
+                        else:
+                            print "already saved " + dochash
+
+                    elif len(row.find_all('th')) is self.getColumnCount():
+                        print "header row"
+
+                    else:
+                        print "ERROR number of columns incorrect"
+                        print row
+

--- a/documents/rss.xml.php
+++ b/documents/rss.xml.php
@@ -9,12 +9,12 @@
 $TestFeed = new RSS2FeedWriter();
 //Setting the channel elements
 //Use wrapper functions for common channelelements
-$TestFeed->setTitle('Last Modified - All');
+$TestFeed->setTitle('disclosurelo.gs Newest Entries - All');
 $TestFeed->setLink('http://disclosurelo.gs/rss.xml.php');
-$TestFeed->setDescription('Latest entries');
-  $TestFeed->setChannelElement('language', 'en-us');
-  $TestFeed->setChannelElement('pubDate', date(DATE_RSS, time()));
-  
+$TestFeed->setDescription('disclosurelo.gs Newest Entries - All Agencies');
+$TestFeed->setChannelElement('language', 'en-us');
+$TestFeed->setChannelElement('pubDate', date(DATE_RSS, time()));
+
 //Retriving informations from database
 $idtoname = Array();
 $agenciesdb = $server->get_db('disclosr-agencies');
@@ -22,18 +22,18 @@
     $idtoname[$row->id] = trim($row->value->name);
 }
 $foidocsdb = $server->get_db('disclosr-foidocuments');
-$rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99','0000-00-00', 50), true)->rows;
+$rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99', '0000-00-00', 50), true)->rows;
 //print_r($rows);
 foreach ($rows as $row) {
     //Create an empty FeedItem
     $newItem = $TestFeed->createNewItem();
     //Add elements to the feed item
     $newItem->setTitle($row->value->title);
-    $newItem->setLink("view.php?id=".$row->value->_id);
-    $newItem->setDate(date("c", strtotime($row->value->date)));
-    $newItem->setDescription(displayLogEntry($row,$idtoname));
+    $newItem->setLink("http://disclosurelo.gs/view.php?id=" . $row->value->_id);
+    $newItem->setDate(strtotime($row->value->date));
+    $newItem->setDescription(displayLogEntry($row, $idtoname));
     $newItem->setAuthor($idtoname[$row->value->agencyID]);
-    $newItem->addElement('guid', $row->value->_id,array('isPermaLink'=>'true'));
+    $newItem->addElement('guid', "http://disclosurelo.gs/view.php?id=" . $row->value->_id, array('isPermaLink' => 'true'));
     //Now add the feed item
     $TestFeed->addItem($newItem);
 }

--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -8,186 +8,188 @@
 import time
 import os
 import mimetypes
-import re
 import urllib
 import urlparse
 
 def mkhash(input):
-	return hashlib.md5(input).hexdigest().encode("utf-8")
+    return hashlib.md5(input).hexdigest().encode("utf-8")
 
 def canonurl(url):
-	r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
-	if the URL looks invalid.
-	>>> canonurl('\xe2\x9e\xa1.ws')  # tinyarro.ws
-	'http://xn--hgi.ws/'
-	"""
-	# strip spaces at the ends and ensure it's prefixed with 'scheme://'
-	url = url.strip()
-	if not url:
-		return ''
-	if not urlparse.urlsplit(url).scheme:
-		url = 'http://' + url
-
-	# turn it into Unicode
-	#try:
-	#    url = unicode(url, 'utf-8')
-	#except UnicodeDecodeError:
-	#    return ''  # bad UTF-8 chars in URL
-
-	# parse the URL into its components
-	parsed = urlparse.urlsplit(url)
-	scheme, netloc, path, query, fragment = parsed
-
-	# ensure scheme is a letter followed by letters, digits, and '+-.' chars
-	if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
-		return ''
-	scheme = str(scheme)
-
-	# ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
-	match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
-	if not match:
-		return ''
-	domain, port = match.groups()
-	netloc = domain + (port if port else '')
-	netloc = netloc.encode('idna')
-
-	# ensure path is valid and convert Unicode chars to %-encoded
-	if not path:
-		path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
-	path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
-
-	# ensure query is valid
-	query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
-
-	# ensure fragment is valid
-	fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
-
-	# piece it all back together, truncating it to a maximum of 4KB
-	url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
-	return url[:4096]
+    r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
+    if the URL looks invalid.
+    >>> canonurl('\xe2\x9e\xa1.ws')  # tinyarro.ws
+    'http://xn--hgi.ws/'
+    """
+    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
+    url = url.strip()
+    if not url:
+        return ''
+    if not urlparse.urlsplit(url).scheme:
+        url = 'http://' + url
+
+    # turn it into Unicode
+    #try:
+    #    url = unicode(url, 'utf-8')
+    #except UnicodeDecodeError:
+    #    return ''  # bad UTF-8 chars in URL
+
+    # parse the URL into its components
+    parsed = urlparse.urlsplit(url)
+    scheme, netloc, path, query, fragment = parsed
+
+    # ensure scheme is a letter followed by letters, digits, and '+-.' chars
+    if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
+        return ''
+    scheme = str(scheme)
+
+    # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
+    match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
+    if not match:
+        return ''
+    domain, port = match.groups()
+    netloc = domain + (port if port else '')
+    netloc = netloc.encode('idna')
+
+    # ensure path is valid and convert Unicode chars to %-encoded
+    if not path:
+        path = '/'  # eg: 'http://google.com' -> 'http://google.com/'
+    path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
+
+    # ensure query is valid
+    query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
+
+    # ensure fragment is valid
+    fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
+
+    # piece it all back together, truncating it to a maximum of 4KB
+    url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
+    return url[:4096]
 
 def fullurl(url,href):
-	href = href.replace(" ","%20")
-	href = re.sub('#.*$','',href)
-	return urljoin(url,href)
+    href = href.replace(" ","%20")
+    href = re.sub('#.*$','',href)
+    return urljoin(url,href)
 
 #http://diveintopython.org/http_web_services/etags.html
-class NotModifiedHandler(urllib2.BaseHandler):  
-	def http_error_304(self, req, fp, code, message, headers):
-		addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
-		addinfourl.code = code
-		return addinfourl
+class NotModifiedHandler(urllib2.BaseHandler):
+    def http_error_304(self, req, fp, code, message, headers):
+        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
+        addinfourl.code = code
+        return addinfourl
 
 def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
-	url = canonurl(url)
-	hash = mkhash(url)
-	req = urllib2.Request(url)
-	print "Fetching %s (%s)" % (url,hash)
-	if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
-		print "Not a valid HTTP url"
-		return (None,None,None)
-	doc = docsdb.get(hash) 
-	if doc == None:
-		doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
-	else:
-		if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
-			print "Uh oh, trying to scrape URL again too soon!"
-			last_attachment_fname = doc["_attachments"].keys()[-1]
-			last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-			content = last_attachment
-			return (doc['url'],doc['mime_type'],content)
-		if scrape_again == False:
-			print "Not scraping this URL again as requested"
-			return (None,None,None)
-
-	time.sleep(3) # wait 3 seconds to give webserver time to recover
-	
-	req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
-	#if there is a previous version stored in couchdb, load caching helper tags
-	if doc.has_key('etag'):
-		req.add_header("If-None-Match", doc['etag'])
-	if doc.has_key('last_modified'):
-		req.add_header("If-Modified-Since", doc['last_modified'])
-	 
-	opener = urllib2.build_opener(NotModifiedHandler())
-	try:
-		url_handle = opener.open(req)
-		doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
-		headers = url_handle.info() # the addinfourls have the .info() too
-		doc['etag'] = headers.getheader("ETag")
-		doc['last_modified'] = headers.getheader("Last-Modified") 
-		doc['date'] = headers.getheader("Date") 
-		doc['page_scraped'] = time.time() 
-		doc['web_server'] = headers.getheader("Server") 
-		doc['via'] = headers.getheader("Via") 
-		doc['powered_by'] = headers.getheader("X-Powered-By") 
-		doc['file_size'] = headers.getheader("Content-Length") 
-		content_type = headers.getheader("Content-Type")
-		if content_type != None:
-			 doc['mime_type'] = content_type.split(";")[0]
-		else:
-			 (type,encoding) = mimetypes.guess_type(url)
-			 doc['mime_type'] = type
-		if hasattr(url_handle, 'code'):
-			if url_handle.code == 304:
-				print "the web page has not been modified"
-				return (None,None,None)
-			else: 
-				content = url_handle.read()
-				docsdb.save(doc)
-				doc = docsdb.get(hash) # need to get a _rev
-				docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type']) 
-				return (doc['url'], doc['mime_type'], content)
-				#store as attachment epoch-filename
-				
-	except urllib2.URLError as e:
-			error = ""
-			if hasattr(e, 'reason'):
-				error = "error %s in downloading %s" % (str(e.reason), url)
-			elif hasattr(e, 'code'):
-				error = "error %s in downloading %s" % (e.code, url)
-			print error
-			doc['error'] = error
-			docsdb.save(doc)
-			return (None,None,None)
+    url = canonurl(url)
+    hash = mkhash(url)
+    req = urllib2.Request(url)
+    print "Fetching %s (%s)" % (url,hash)
+    if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
+        print "Not a valid HTTP url"
+        return (None,None,None)
+    doc = docsdb.get(hash)
+    if doc == None:
+        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
+    else:
+        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
+            print "Uh oh, trying to scrape URL again too soon!"+hash
+            last_attachment_fname = doc["_attachments"].keys()[-1]
+            last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+            content = last_attachment
+            return (doc['url'],doc['mime_type'],content)
+        if scrape_again == False:
+            print "Not scraping this URL again as requested"
+            return (None,None,None)
+
+    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
+    #if there is a previous version stored in couchdb, load caching helper tags
+    if doc.has_key('etag'):
+        req.add_header("If-None-Match", doc['etag'])
+    if doc.has_key('last_modified'):
+        req.add_header("If-Modified-Since", doc['last_modified'])
+
+    opener = urllib2.build_opener(NotModifiedHandler())
+    try:
+        url_handle = opener.open(req)
+        doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
+        headers = url_handle.info() # the addinfourls have the .info() too
+        doc['etag'] = headers.getheader("ETag")
+        doc['last_modified'] = headers.getheader("Last-Modified")
+        doc['date'] = headers.getheader("Date")
+        doc['page_scraped'] = time.time()
+        doc['web_server'] = headers.getheader("Server")
+        doc['via'] = headers.getheader("Via")
+        doc['powered_by'] = headers.getheader("X-Powered-By")
+        doc['file_size'] = headers.getheader("Content-Length")
+        content_type = headers.getheader("Content-Type")
+        if content_type != None:
+             doc['mime_type'] = content_type.split(";")[0]
+        else:
+             (type,encoding) = mimetypes.guess_type(url)
+             doc['mime_type'] = type
+        if hasattr(url_handle, 'code'):
+            if url_handle.code == 304:
+                print "the web page has not been modified"+hash
+                last_attachment_fname = doc["_attachments"].keys()[-1]
+                last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+                content = last_attachment
+                return (doc['url'],doc['mime_type'],content)
+            else:
+                print "new webpage loaded"
+                content = url_handle.read()
+                docsdb.save(doc)
+                doc = docsdb.get(hash) # need to get a _rev
+                docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
+                return (doc['url'], doc['mime_type'], content)
+                #store as attachment epoch-filename
+
+    except urllib2.URLError as e:
+            print "error!"
+            error = ""
+            if hasattr(e, 'reason'):
+                error = "error %s in downloading %s" % (str(e.reason), url)
+            elif hasattr(e, 'code'):
+                error = "error %s in downloading %s" % (e.code, url)
+            print error
+            doc['error'] = error
+            docsdb.save(doc)
+            return (None,None,None)
 
 
 def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
-	(url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
-	badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
-	if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
-		if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
-				# http://www.crummy.com/software/BeautifulSoup/documentation.html
-				soup = BeautifulSoup(content)
-				navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
-				for nav in navIDs:
-					print "Removing element", nav['id']
-					nav.extract()
-					navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
-					for nav in navClasses:
-						print "Removing element", nav['class']
-						nav.extract()
-					links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
-					linkurls = set([])
-					for link in links:
-						if link.has_key("href"):
-							if link['href'].startswith("http"):
-								# lets not do external links for now
-								# linkurls.add(link['href'])
-								None
-							if link['href'].startswith("mailto"):
-								# not http
-								None
-							if link['href'].startswith("javascript"):
-								# not http
-								None
-							else:
-								# remove anchors and spaces in urls
-								linkurls.add(fullurl(url,link['href']))
-					for linkurl in linkurls:
-							   #print linkurl
-							   scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)    
+    (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
+    if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
+        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+                # http://www.crummy.com/software/BeautifulSoup/documentation.html
+                soup = BeautifulSoup(content)
+                navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
+                for nav in navIDs:
+                    print "Removing element", nav['id']
+                    nav.extract()
+                    navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
+                    for nav in navClasses:
+                        print "Removing element", nav['class']
+                        nav.extract()
+                    links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
+                    linkurls = set([])
+                    for link in links:
+                        if link.has_key("href"):
+                            if link['href'].startswith("http"):
+                                # lets not do external links for now
+                                # linkurls.add(link['href'])
+                                None
+                            if link['href'].startswith("mailto"):
+                                # not http
+                                None
+                            if link['href'].startswith("javascript"):
+                                # not http
+                                None
+                            else:
+                                # remove anchors and spaces in urls
+                                linkurls.add(fullurl(url,link['href']))
+                    for linkurl in linkurls:
+                               #print linkurl
+                               scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
 
 #couch = couchdb.Server('http://192.168.1.148:5984/')
 couch = couchdb.Server('http://127.0.0.1:5984/')
@@ -196,20 +198,20 @@
 docsdb = couch['disclosr-documents']
 
 if __name__ == "__main__":
-	for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
-		agency = agencydb.get(row.id)
-		print agency['name']
-		for key in agency.keys():
-			if key == "FOIDocumentsURL" and "status" not in agency.keys:
-				scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
-			if key == 'website' and False:
-				scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
-                                agency['metadata']['lastScraped'] = time.time()
-			if key.endswith('URL') and False:
-				print key 
-				depth = 1
-				if 'scrapeDepth' in agency.keys():
-					depth = agency['scrapeDepth']
-				scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
-		agencydb.save(agency)
-
+    for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
+        agency = agencydb.get(row.id)
+        print agency['name']
+        for key in agency.keys():
+            if key == "FOIDocumentsURL" and "status" not in agency.keys:
+                scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+            if key == 'website' and False:
+                scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+                agency['metadata']['lastScraped'] = time.time()
+            if key.endswith('URL') and False:
+                print key
+                depth = 1
+                if 'scrapeDepth' in agency.keys():
+                    depth = agency['scrapeDepth']
+                scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
+        agencydb.save(agency)
+

--- /dev/null
+++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py
@@ -1,1 +1,19 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
 
+
+class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
+    """Scraper that adds nothing of its own to genericScrapers.GenericPDFDisclogScraper."""
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation,
+         genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(),
+         genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- a/documents/scrapers/0049d35216493c545ef5f7f000e6b252.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-pdf
 

--- /dev/null
+++ b/documents/scrapers/00a294de663db69062ca09aede7c0487.py
@@ -1,1 +1,47 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import dateutil
+from dateutil.parser import *
+from datetime import *
 
+
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+    def getDate(self, content, entry, doc):
+        date = ''.join(entry.find('th').stripped_strings).strip()
+        (a, b, c) = date.partition("(")
+        date = self.remove_control_chars(a.replace("Octber", "October"))
+        print date
+        edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+        print edate
+        doc.update({'date': edate})
+        return
+    def getColumnCount(self):
+        return 4
+
+    def getTable(self, soup):
+        return soup.find(summary="List of Defence documents released under Freedom of Information requets")
+
+    def getColumns(self, columns):
+        (id, description, access, notes) = columns
+        return (id, None, description, description, notes)
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+
+    nsi = ScraperImplementation()
+    nsi.disclogURL = "http://www.defence.gov.au/foi/disclosure_log_201213.cfm"
+    nsi.doScrape()
+
+    nsi.disclogURL = "http://www.defence.gov.au/foi/disclosure_log_201112.cfm"
+    nsi.doScrape()
+
+    nsi.disclogURL = "http://www.defence.gov.au/foi/disclosure_log_201011.cfm"
+    nsi.doScrape()
+
+

--- a/documents/scrapers/00a294de663db69062ca09aede7c0487.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-multipage
 

--- a/documents/scrapers/0ae822d1a748e60d90f0b79b97d5a3e5.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-ACMA style
 

--- /dev/null
+++ b/documents/scrapers/1803322b27286950cab0c543168b5f21.py
@@ -1,1 +1,58 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import dateutil
+from dateutil.parser import *
+from datetime import *
+import scrape
+from bs4 import BeautifulSoup
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
 
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+    def getDescription(self,content, entry,doc):
+        link = None
+        links = []
+        description = ""
+        for atag in entry.find_all('a'):
+            if atag.has_key('href'):
+                link = scrape.fullurl(self.getURL(), atag['href'])
+                (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+                if htcontent != None:
+                    if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+                        soup = BeautifulSoup(htcontent)
+                        row  = soup.find(id="content_div_148050")
+                        description = ''.join(row.stripped_strings)
+                        for atag in row.find_all("a"):
+                                    if atag.has_key('href'):
+                                        links.append(scrape.fullurl(link, atag['href']))
+
+        if links != []:
+                     doc.update({'links': links})
+        if description != "":
+            doc.update({ 'description': description})
+    def getColumnCount(self):
+        return 4
+
+    def getColumns(self, columns):
+        (id, date, datepub, title) = columns
+        return (id, date, title, title, None)
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+
+    nsi = ScraperImplementation()
+    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=1"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=2"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=3"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=4"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=5"
+    nsi.doScrape()
+

--- a/documents/scrapers/1803322b27286950cab0c543168b5f21.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-multipage log
 

--- /dev/null
+++ b/documents/scrapers/6afdde1d4ff1ad8d8cfe1a8675ea83bd.py
@@ -1,1 +1,19 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
 
+
+class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
+    """Scraper that adds nothing of its own to genericScrapers.GenericPDFDisclogScraper."""
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation,
+         genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(),
+         genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- a/documents/scrapers/6afdde1d4ff1ad8d8cfe1a8675ea83bd.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-PDF
 

--- /dev/null
+++ b/documents/scrapers/8317df630946937864d31a4728ad8ee8.py
@@ -1,1 +1,19 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
 
+
+class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
+    """Scraper that adds nothing of its own to genericScrapers.GenericPDFDisclogScraper."""
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation,
+         genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(),
+         genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- a/documents/scrapers/8317df630946937864d31a4728ad8ee8.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-pdf
 

--- /dev/null
+++ b/documents/scrapers/8796220032faf94501bd366763263685.py
@@ -1,1 +1,37 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import dateutil
+from dateutil.parser import *
+from datetime import *
 
+
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+    def getColumnCount(self):
+        return 6
+
+    def getColumns(self, columns):
+        (id, date, title, description, datepub, notes) = columns
+        return (id, date, title, description, notes)
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+
+    nsi = ScraperImplementation()
+    nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/pmo/2011-12.cfm"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/dpmc/2011-12.cfm"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/dpmc/2012-13.cfm"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/omsi/2011-12.cfm"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/omps/2012-13.cfm"
+    nsi.doScrape()
+

--- a/documents/scrapers/8796220032faf94501bd366763263685.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-multiple pages
 

--- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py
+++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py
@@ -3,7 +3,7 @@
 import genericScrapers
 import scrape
 from bs4 import BeautifulSoup
-import codecs 
+import codecs
 #http://www.doughellmann.com/PyMOTW/abc/
 class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
         def getDescription(self,content, entry,doc):
@@ -20,7 +20,7 @@
                                                 soup = BeautifulSoup(htcontent)
                                                 for text in soup.find(id="divFullWidthColumn").stripped_strings:
                                                     description = description + text.encode('ascii', 'ignore')
-                                                
+
                                                 for atag in soup.find(id="divFullWidthColumn").find_all("a"):
                                                       	if atag.has_key('href'):
                                                               	links.append(scrape.fullurl(link,atag['href']))
@@ -76,11 +76,10 @@
 if __name__ == '__main__':
     print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
     print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
-    #NewScraperImplementation().doScrape()
+    NewScraperImplementation().doScrape()
     print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
     print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
     osi = OldScraperImplementation()
     osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI"
     osi.doScrape()
-# old site too
 

--- /dev/null
+++ b/documents/scrapers/b0a3281ba66efe173c5a33d5ef90ff76.py
@@ -1,1 +1,35 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import dateutil
+from dateutil.parser import *
+from datetime import *
 
+
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+    def getColumnCount(self):
+        return 2
+
+    def getColumns(self, columns):
+        (date, title) = columns
+        return (title, date, title, title, None)
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+
+    nsi = ScraperImplementation()
+    nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2012.htm"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2011.htm"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2010.htm"
+    nsi.doScrape()
+    nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2009.htm"
+    nsi.doScrape()
+

--- a/documents/scrapers/b0a3281ba66efe173c5a33d5ef90ff76.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-multipage immi
 

--- /dev/null
+++ b/documents/scrapers/c57c0bf315ce5977e730905707a2f6a3.py
@@ -1,1 +1,19 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
 
+
+class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
+    """Scraper that adds nothing of its own to genericScrapers.GenericPDFDisclogScraper."""
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation,
+         genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(),
+         genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- a/documents/scrapers/c57c0bf315ce5977e730905707a2f6a3.txt
+++ /dev/null
@@ -1,3 +1,1 @@
-# pdf
-http://www.awm.gov.au/about/AWM_Disclosure_Log.pdf
 

--- a/documents/template.inc.php
+++ b/documents/template.inc.php
@@ -98,9 +98,12 @@
                 _gaq.push(['_trackPageview']);
 
                 (function() {
-                    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+                    var ga = document.createElement('script');
+                    ga.type = 'text/javascript';
+                    ga.async = true;
                     ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
-                    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+                    var s = document.getElementsByTagName('script')[0];
+                    s.parentNode.insertBefore(ga, s);
                 })();
 
             </script>
@@ -146,9 +149,12 @@
 function displayLogEntry($row, $idtoname) {
     $result = "";
     $result .= '<div itemscope itemtype="http://schema.org/Article">';
-    $result .= '<h2> <span itemprop="datePublished">' . $row->value->date . "</span>: <span itemprop='name headline'>" . truncate($row->value->title, 120)."</span>";
-    $result .= '(<span itemprop="author publisher creator">' . $idtoname[$row->value->agencyID] . '</span>)</h2>';
-    $result .= "<p itemprop='description articleBody text'> Title" . $row->value->title . "<br/>" . str_replace("\n", "<br>", $row->value->description);
+    $result .= '<h2> <span itemprop="datePublished">' . $row->value->date . "</span>: <span itemprop='name headline'>" . truncate($row->value->title, 120) . "</span>";
+    $result .= ' (<span itemprop="author publisher creator">' . $idtoname[$row->value->agencyID] . '</span>)</h2>';
+    $result .= "<p itemprop='description articleBody text'> Title: " . $row->value->title . "<br/>";
+    if (isset($row->value->description)) {
+        $result .= str_replace("\n", "<br>", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "",trim($row->value->description)));
+    }
     if (isset($row->value->notes)) {
         $result .= " <br>Note: " . $row->value->notes;
     }
@@ -157,7 +163,7 @@
     if (isset($row->value->links)) {
         $result .= '<h3>Links/Documents</h3><ul itemprop="associatedMedia">';
         foreach ($row->value->links as $link) {
-            $result .= '<li itemscope itemtype="http://schema.org/MediaObject"><a href='.$link.' itemprop="url contentURL">' . urlencode($link) . "</a></li>";
+            $result .= '<li itemscope itemtype="http://schema.org/MediaObject"><a href=' . $link . ' itemprop="url contentURL">' . urlencode($link) . "</a></li>";
         }
 
         $result .= "</ul>";

--- a/lib/FeedWriter
+++ /dev/null

--- /dev/null
+++ b/lib/FeedWriter/COPYING
@@ -1,1 +1,675 @@
-
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering a