From: Maxious
Date: Mon, 03 Dec 2012 09:22:28 +0000
Subject: pdf scrapers
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=8c65f1f2248f9f6748c09a69638e43e9389888be

---

pdf scrapers

Former-commit-id: 6a33167b9cf20ed9af0d41f252320532648535db
---

--- /dev/null
+++ b/documents/disclosr-documents.nja
@@ -1,1 +1,7 @@
-
+{
+  "venv": "",
+  "project-type": "Import from sources",
+  "name": "disclosr-documents",
+  "license": "GNU General Public License v3",
+  "description": ""
+}

--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -15,10 +15,6 @@
 from StringIO import StringIO
-from docx import *
-from lxml import etree
-import zipfile
-
 from pdfminer.pdfparser import PDFDocument, PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
 from pdfminer.pdfdevice import PDFDevice, TagExtractor
@@ -39,14 +35,14 @@
         """ disclosr agency id """
         if self.agencyID is None:
             self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
-        return self.agencyID
+        return self.agencyID

     def getURL(self):
         """ disclog URL"""
         if self.disclogURL is None:
             agency = scrape.agencydb.get(self.getAgencyID())
             self.disclogURL = agency['FOIDocumentsURL']
-        return self.disclogURL
+        return self.disclogURL

     @abc.abstractmethod
     def doScrape(self):
@@ -62,14 +58,15 @@
             self.getURL(), "foidocuments", self.getAgencyID())
         laparams = LAParams()
         rsrcmgr = PDFResourceManager(caching=True)
-        outfp = StringIO.StringIO()
+        outfp = StringIO()
         device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams)
-        fp = StringIO.StringIO()
-        fp.write(content)
-        description = output.getvalue()
+        fp = StringIO()
+        fp.write(content.read())
+        process_pdf(rsrcmgr, device, fp, set(), caching=True, check_extractable=True)
+        description = outfp.getvalue()
         fp.close()
         device.close()
         outfp.close()
@@ -77,11 +74,10 @@
         doc = foidocsdb.get(dochash)
         if doc is None:
             print "saving " + dochash
-            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
+            edate = date.today().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated"}
-            self.getDescription(entry, entry, doc)
+            "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"
@@ -103,17 +99,16 @@
         for paratext in paratextlist:
             newparatextlist.append(paratext.encode("utf-8"))
         ## Print our documnts test with two newlines under each paragraph
-        description = '\n\n'.join(newparatextlist)
+        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
         dochash = scrape.mkhash(description)
         doc = foidocsdb.get(dochash)
         if doc is None:
             print "saving " + dochash
-            edate = datetime.fromtimestamp(mktime()).strftime("%Y-%m-%d")
+            edate = time().strftime("%Y-%m-%d")
             doc = {'_id': dochash, 'agencyID': self.getAgencyID()
             , 'url': self.getURL(), 'docID': dochash,
-            "date": edate, "title": "Disclosure Log Updated"}
-            self.getDescription(entry, entry, doc)
+            "date": edate, "title": "Disclosure Log Updated", "description": description}
             foidocsdb.save(doc)
         else:
             print "already saved"

--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -8,186 +8,188 @@
 import time
 import os
 import mimetypes
-import re
 import urllib
 import urlparse

 def mkhash(input):
-    return hashlib.md5(input).hexdigest().encode("utf-8")
+    return hashlib.md5(input).hexdigest().encode("utf-8")

 def canonurl(url):
-    r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
-    if the URL looks invalid.
-    >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
-    'http://xn--hgi.ws/'
-    """
-    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
-    url = url.strip()
-    if not url:
-        return ''
-    if not urlparse.urlsplit(url).scheme:
-        url = 'http://' + url
-
-    # turn it into Unicode
-    #try:
-    #    url = unicode(url, 'utf-8')
-    #except UnicodeDecodeError:
-    #    return '' # bad UTF-8 chars in URL
-
-    # parse the URL into its components
-    parsed = urlparse.urlsplit(url)
-    scheme, netloc, path, query, fragment = parsed
-
-    # ensure scheme is a letter followed by letters, digits, and '+-.' chars
-    if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
-        return ''
-    scheme = str(scheme)
-
-    # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
-    match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
-    if not match:
-        return ''
-    domain, port = match.groups()
-    netloc = domain + (port if port else '')
-    netloc = netloc.encode('idna')
-
-    # ensure path is valid and convert Unicode chars to %-encoded
-    if not path:
-        path = '/' # eg: 'http://google.com' -> 'http://google.com/'
-    path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
-
-    # ensure query is valid
-    query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
-
-    # ensure fragment is valid
-    fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
-
-    # piece it all back together, truncating it to a maximum of 4KB
-    url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
-    return url[:4096]
+    r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or ''
+    if the URL looks invalid.
+    >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws
+    'http://xn--hgi.ws/'
+    """
+    # strip spaces at the ends and ensure it's prefixed with 'scheme://'
+    url = url.strip()
+    if not url:
+        return ''
+    if not urlparse.urlsplit(url).scheme:
+        url = 'http://' + url
+
+    # turn it into Unicode
+    #try:
+    #    url = unicode(url, 'utf-8')
+    #except UnicodeDecodeError:
+    #    return '' # bad UTF-8 chars in URL
+
+    # parse the URL into its components
+    parsed = urlparse.urlsplit(url)
+    scheme, netloc, path, query, fragment = parsed
+
+    # ensure scheme is a letter followed by letters, digits, and '+-.' chars
+    if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I):
+        return ''
+    scheme = str(scheme)
+
+    # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
+    match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I)
+    if not match:
+        return ''
+    domain, port = match.groups()
+    netloc = domain + (port if port else '')
+    netloc = netloc.encode('idna')
+
+    # ensure path is valid and convert Unicode chars to %-encoded
+    if not path:
+        path = '/' # eg: 'http://google.com' -> 'http://google.com/'
+    path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;')
+
+    # ensure query is valid
+    query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/')
+
+    # ensure fragment is valid
+    fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8')))
+
+    # piece it all back together, truncating it to a maximum of 4KB
+    url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))
+    return url[:4096]

 def fullurl(url,href):
-    href = href.replace(" ","%20")
-    href = re.sub('#.*$','',href)
-    return urljoin(url,href)
+    href = href.replace(" ","%20")
+    href = re.sub('#.*$','',href)
+    return urljoin(url,href)

 #http://diveintopython.org/http_web_services/etags.html
-class NotModifiedHandler(urllib2.BaseHandler):
-    def http_error_304(self, req, fp, code, message, headers):
-        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
-        addinfourl.code = code
-        return addinfourl
+class NotModifiedHandler(urllib2.BaseHandler):
+    def http_error_304(self, req, fp, code, message, headers):
+        addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url())
+        addinfourl.code = code
+        return addinfourl

 def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True):
-    url = canonurl(url)
-    hash = mkhash(url)
-    req = urllib2.Request(url)
-    print "Fetching %s (%s)" % (url,hash)
-    if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
-        print "Not a valid HTTP url"
-        return (None,None,None)
-    doc = docsdb.get(hash)
-    if doc == None:
-        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
-    else:
-        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
-            print "Uh oh, trying to scrape URL again too soon!"
-            last_attachment_fname = doc["_attachments"].keys()[-1]
-            last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-            content = last_attachment
-            return (doc['url'],doc['mime_type'],content)
-        if scrape_again == False:
-            print "Not scraping this URL again as requested"
-            return (None,None,None)
-
-    time.sleep(3) # wait 3 seconds to give webserver time to recover
-
-    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
-    #if there is a previous version stored in couchdb, load caching helper tags
-    if doc.has_key('etag'):
-        req.add_header("If-None-Match", doc['etag'])
-    if doc.has_key('last_modified'):
-        req.add_header("If-Modified-Since", doc['last_modified'])
-
-    opener = urllib2.build_opener(NotModifiedHandler())
-    try:
-        url_handle = opener.open(req)
-        doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
-        headers = url_handle.info() # the addinfourls have the .info() too
-        doc['etag'] = headers.getheader("ETag")
-        doc['last_modified'] = headers.getheader("Last-Modified")
-        doc['date'] = headers.getheader("Date")
-        doc['page_scraped'] = time.time()
-        doc['web_server'] = headers.getheader("Server")
-        doc['via'] = headers.getheader("Via")
-        doc['powered_by'] = headers.getheader("X-Powered-By")
-        doc['file_size'] = headers.getheader("Content-Length")
-        content_type = headers.getheader("Content-Type")
-        if content_type != None:
-            doc['mime_type'] = content_type.split(";")[0]
-        else:
-            (type,encoding) = mimetypes.guess_type(url)
-            doc['mime_type'] = type
-        if hasattr(url_handle, 'code'):
-            if url_handle.code == 304:
-                print "the web page has not been modified"
-                return (None,None,None)
-            else:
-                content = url_handle.read()
-                docsdb.save(doc)
-                doc = docsdb.get(hash) # need to get a _rev
-                docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
-                return (doc['url'], doc['mime_type'], content)
-        #store as attachment epoch-filename
-
-    except urllib2.URLError as e:
-        error = ""
-        if hasattr(e, 'reason'):
-            error = "error %s in downloading %s" % (str(e.reason), url)
-        elif hasattr(e, 'code'):
-            error = "error %s in downloading %s" % (e.code, url)
-        print error
-        doc['error'] = error
-        docsdb.save(doc)
-        return (None,None,None)
+    url = canonurl(url)
+    hash = mkhash(url)
+    req = urllib2.Request(url)
+    print "Fetching %s (%s)" % (url,hash)
+    if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
+        print "Not a valid HTTP url"
+        return (None,None,None)
+    doc = docsdb.get(hash)
+    if doc == None:
+        doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
+    else:
+        if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000):
+            print "Uh oh, trying to scrape URL again too soon!"+hash
+            last_attachment_fname = doc["_attachments"].keys()[-1]
+            last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+            content = last_attachment
+            return (doc['url'],doc['mime_type'],content)
+        if scrape_again == False:
+            print "Not scraping this URL again as requested"
+            return (None,None,None)
+
+    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
+    #if there is a previous version stored in couchdb, load caching helper tags
+    if doc.has_key('etag'):
+        req.add_header("If-None-Match", doc['etag'])
+    if doc.has_key('last_modified'):
+        req.add_header("If-Modified-Since", doc['last_modified'])
+
+    opener = urllib2.build_opener(NotModifiedHandler())
+    try:
+        url_handle = opener.open(req)
+        doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
+        headers = url_handle.info() # the addinfourls have the .info() too
+        doc['etag'] = headers.getheader("ETag")
+        doc['last_modified'] = headers.getheader("Last-Modified")
+        doc['date'] = headers.getheader("Date")
+        doc['page_scraped'] = time.time()
+        doc['web_server'] = headers.getheader("Server")
+        doc['via'] = headers.getheader("Via")
+        doc['powered_by'] = headers.getheader("X-Powered-By")
+        doc['file_size'] = headers.getheader("Content-Length")
+        content_type = headers.getheader("Content-Type")
+        if content_type != None:
+            doc['mime_type'] = content_type.split(";")[0]
+        else:
+            (type,encoding) = mimetypes.guess_type(url)
+            doc['mime_type'] = type
+        if hasattr(url_handle, 'code'):
+            if url_handle.code == 304:
+                print "the web page has not been modified"+hash
+                last_attachment_fname = doc["_attachments"].keys()[-1]
+                last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
+                content = last_attachment
+                return (doc['url'],doc['mime_type'],content)
+            else:
+                print "new webpage loaded"
+                content = url_handle.read()
+                docsdb.save(doc)
+                doc = docsdb.get(hash) # need to get a _rev
+                docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
+                return (doc['url'], doc['mime_type'], content)
+        #store as attachment epoch-filename
+
+    except urllib2.URLError as e:
+        print "error!"
+        error = ""
+        if hasattr(e, 'reason'):
+            error = "error %s in downloading %s" % (str(e.reason), url)
+        elif hasattr(e, 'code'):
+            error = "error %s in downloading %s" % (e.code, url)
+        print error
+        doc['error'] = error
+        docsdb.save(doc)
+        return (None,None,None)

 def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
-    (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
-    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
-    if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
-        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
-            # http://www.crummy.com/software/BeautifulSoup/documentation.html
-            soup = BeautifulSoup(content)
-            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
-            for nav in navIDs:
-                print "Removing element", nav['id']
-                nav.extract()
-            navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
-            for nav in navClasses:
-                print "Removing element", nav['class']
-                nav.extract()
-            links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
-            linkurls = set([])
-            for link in links:
-                if link.has_key("href"):
-                    if link['href'].startswith("http"):
-                        # lets not do external links for now
-                        # linkurls.add(link['href'])
-                        None
-                    if link['href'].startswith("mailto"):
-                        # not http
-                        None
-                    if link['href'].startswith("javascript"):
-                        # not http
-                        None
-                    else:
-                        # remove anchors and spaces in urls
-                        linkurls.add(fullurl(url,link['href']))
-            for linkurl in linkurls:
-                #print linkurl
-                scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
+    (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
+    if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
+        if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+            # http://www.crummy.com/software/BeautifulSoup/documentation.html
+            soup = BeautifulSoup(content)
+            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
+            for nav in navIDs:
+                print "Removing element", nav['id']
+                nav.extract()
+            navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
+            for nav in navClasses:
+                print "Removing element", nav['class']
+                nav.extract()
+            links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-"))
+            linkurls = set([])
+            for link in links:
+                if link.has_key("href"):
+                    if link['href'].startswith("http"):
+                        # lets not do external links for now
+                        # linkurls.add(link['href'])
+                        None
+                    if link['href'].startswith("mailto"):
+                        # not http
+                        None
+                    if link['href'].startswith("javascript"):
+                        # not http
+                        None
+                    else:
+                        # remove anchors and spaces in urls
+                        linkurls.add(fullurl(url,link['href']))
+            for linkurl in linkurls:
+                #print linkurl
+                scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)

 #couch = couchdb.Server('http://192.168.1.148:5984/')
 couch = couchdb.Server('http://127.0.0.1:5984/')

@@ -196,20 +198,20 @@
 docsdb = couch['disclosr-documents']

 if __name__ == "__main__":
-    for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
-        agency = agencydb.get(row.id)
-        print agency['name']
-        for key in agency.keys():
-            if key == "FOIDocumentsURL" and "status" not in agency.keys:
-                scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
-            if key == 'website' and False:
-                scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
-                agency['metadata']['lastScraped'] = time.time()
-            if key.endswith('URL') and False:
-                print key
-                depth = 1
-                if 'scrapeDepth' in agency.keys():
-                    depth = agency['scrapeDepth']
-                scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
-        agencydb.save(agency)
-
+    for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view?
+        agency = agencydb.get(row.id)
+        print agency['name']
+        for key in agency.keys():
+            if key == "FOIDocumentsURL" and "status" not in agency.keys:
+                scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+            if key == 'website' and False:
+                scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+                agency['metadata']['lastScraped'] = time.time()
+            if key.endswith('URL') and False:
+                print key
+                depth = 1
+                if 'scrapeDepth' in agency.keys():
+                    depth = agency['scrapeDepth']
+                scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
+        agencydb.save(agency)
+

--- /dev/null
+++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py
@@ -1,1 +1,19 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+
+class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
+
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation,
+        genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(),
+        genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- a/documents/scrapers/0049d35216493c545ef5f7f000e6b252.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-pdf

--- /dev/null
+++ b/documents/scrapers/6afdde1d4ff1ad8d8cfe1a8675ea83bd.py
@@ -1,1 +1,19 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+
+class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
+
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation,
+        genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(),
+        genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- a/documents/scrapers/6afdde1d4ff1ad8d8cfe1a8675ea83bd.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-PDF

--- /dev/null
+++ b/documents/scrapers/8317df630946937864d31a4728ad8ee8.py
@@ -1,1 +1,19 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+
+class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
+
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation,
+        genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(),
+        genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- a/documents/scrapers/8317df630946937864d31a4728ad8ee8.txt
+++ /dev/null
@@ -1,2 +1,1 @@
-pdf

--- /dev/null
+++ b/documents/scrapers/c57c0bf315ce5977e730905707a2f6a3.py
@@ -1,1 +1,19 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+
+class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
+
+    def __init__(self):
+        super(ScraperImplementation, self).__init__()
+
+
+if __name__ == '__main__':
+    print 'Subclass:', issubclass(ScraperImplementation,
+        genericScrapers.GenericOAICDisclogScraper)
+    print 'Instance:', isinstance(ScraperImplementation(),
+        genericScrapers.GenericOAICDisclogScraper)
+    ScraperImplementation().doScrape()
+

--- a/documents/scrapers/c57c0bf315ce5977e730905707a2f6a3.txt
+++ /dev/null
@@ -1,3 +1,1 @@
-# pdf
-http://www.awm.gov.au/about/AWM_Disclosure_Log.pdf
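Taken together, the Python changes above mean a disclosure-log scraper for a PDF-only agency is now just a subclass of GenericPDFDisclogScraper: fetch the PDF, run it through pdfminer's process_pdf into a StringIO, and keep the extracted text as the entry's description. The following is an illustrative standalone sketch of that extraction step only, not part of the patch; it assumes Python 2, the 2012-era pdfminer API used in the diff, and a placeholder URL.

    # Illustrative only: extract text from a disclosure-log PDF the same way
    # GenericPDFDisclogScraper does, using pdfminer's old process_pdf helper.
    import urllib2
    from StringIO import StringIO
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    def pdf_to_text(url):
        pdf_bytes = urllib2.urlopen(url).read()  # raw PDF content
        rsrcmgr = PDFResourceManager(caching=True)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
        fp = StringIO(pdf_bytes)
        # mirror the call made in genericScrapers.py above
        process_pdf(rsrcmgr, device, fp, set(), caching=True, check_extractable=True)
        device.close()
        text = outfp.getvalue()
        outfp.close()
        return text

    # hypothetical example URL:
    # print pdf_to_text('http://www.example.gov.au/foi/disclosure_log.pdf')

Hashing the extracted text (rather than a scrape date) to form the document id is what lets the scrapers skip PDFs that have not changed since the last run.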
--- a/documents/template.inc.php
+++ b/documents/template.inc.php
@@ -150,10 +150,10 @@
     $result = "";
     $result .= '[...]';
     $result .= '[...]: " . truncate($row->value->title, 120) . "";
-    $result .= '(' . $idtoname[$row->value->agencyID] . ')[...]';
-    $result .= "[...]Title[...]" . $row->value->title . "[...]";
+    $result .= ' (' . $idtoname[$row->value->agencyID] . ')';
+    $result .= "[...]Title: " . $row->value->title . "[...]";
     if (isset($row->value->description)) {
-        $result .= str_replace("\n", "[...]", $row->value->description);
+        $result .= str_replace("\n", "[...]", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "", trim($row->value->description)));
     }
     if (isset($row->value->notes)) {
         $result .= "[...]Note: " . $row->value->notes;
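On the storage side, both generic scrapers now write the extracted text into the saved document's description field, which the template.inc.php hunk above then renders after collapsing blank lines. The save-if-new pattern they share looks roughly like the sketch below; it is illustrative only, with hypothetical database and agency names, and assumes Python 2 plus the python-couchdb client already used in scrape.py.

    # Illustrative only: store a disclog entry keyed by a hash of its text so
    # that re-scraping an unchanged disclosure log does not create duplicates.
    import hashlib
    from datetime import date
    import couchdb

    def mkhash(text):
        return hashlib.md5(text).hexdigest()

    def save_if_new(foidocsdb, agency_id, url, description):
        dochash = mkhash(description)
        if foidocsdb.get(dochash) is None:
            print "saving " + dochash
            foidocsdb.save({'_id': dochash, 'agencyID': agency_id,
                            'url': url, 'docID': dochash,
                            'date': date.today().strftime("%Y-%m-%d"),
                            'title': "Disclosure Log Updated",
                            'description': description})
        else:
            print "already saved"

    # hypothetical usage:
    # couch = couchdb.Server('http://127.0.0.1:5984/')
    # save_if_new(couch['disclosr-foidocuments'], 'exampleAgencyID',
    #             'http://www.example.gov.au/foi/disclosure_log.pdf',
    #             'text extracted from the PDF...')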