|
import sys,os |
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) |
|
import genericScrapers |
|
import dateutil |
|
from dateutil.parser import * |
|
from datetime import * |
|
|
|
|
|
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """Disclosure-log scraper for a four-column HTML table.

    Specialises ``genericScrapers.GenericOAICDisclogScraper`` by supplying
    the table lookup, the column layout and the date extraction.
    """

    def __init__(self):
        """Delegate construction to the generic scraper base class."""
        super(ScraperImplementation, self).__init__()

    def getDate(self, content, entry, doc):
        """Store the entry's date in ``doc['date']`` as ``YYYY-MM-DD``.

        The date text is read from the first ``th`` element of ``entry``.
        Anything from the first "(" onwards is discarded before parsing.
        ``content`` is accepted but not used here.  Returns ``None``.
        """
        header_cell = entry.find('th')
        raw_text = ''.join(header_cell.stripped_strings).strip()

        # Keep only the part before the first "(".
        before_paren = raw_text.partition("(")[0]

        # Repair a known misspelling of the month name before parsing.
        corrected = before_paren.replace("Octber", "October")
        date = self.remove_control_chars(corrected)
        print(date)

        # Day-first, fuzzy parsing so surrounding words do not break it.
        parsed = parse(date, dayfirst=True, fuzzy=True)
        edate = parsed.strftime("%Y-%m-%d")
        print(edate)

        doc.update({'date': edate})

    def getColumnCount(self):
        """Return the number of columns in each table row (always 4)."""
        return 4

    def getTable(self, soup):
        """Return the element of ``soup`` whose ``summary`` matches the log table."""
        # The misspelling "requets" is part of the lookup value - keep as is.
        table_summary = "List of Defence documents released under Freedom of Information requets"
        return soup.find(summary=table_summary)

    def getColumns(self, columns):
        """Rearrange the four row cells into the five-tuple that is returned.

        ``columns`` must unpack into exactly four items: id, description,
        access and notes.  The access cell is dropped and the description
        is emitted twice.
        """
        record_id, summary, _access, remarks = columns
        # NOTE(review): the second slot is None - presumably the date slot,
        # since getDate() fills the date separately; confirm in base class.
        return (record_id, None, summary, summary, remarks)
|
|
|
|
|
if __name__ == '__main__':
    # Quick sanity output: the class derives from the generic scraper and
    # an instance of it can be constructed.
    base_class = genericScrapers.GenericOAICDisclogScraper
    print('Subclass: %s' % issubclass(ScraperImplementation, base_class))
    print('Instance: %s' % isinstance(ScraperImplementation(), base_class))

    # Scrape each yearly disclosure log in turn, reusing a single scraper
    # and pointing it at the next URL before every run.
    scraper = ScraperImplementation()
    yearly_log_urls = (
        "http://www.defence.gov.au/foi/disclosure_log_201213.cfm",
        "http://www.defence.gov.au/foi/disclosure_log_201112.cfm",
        "http://www.defence.gov.au/foi/disclosure_log_201011.cfm",
    )
    for log_url in yearly_log_urls:
        scraper.disclogURL = log_url
        scraper.doScrape()
|
|
|
|