From: Maxious Date: Thu, 29 Nov 2012 12:14:23 +0000 Subject: mo scrapers and apsc2012 imoort X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=992a7a9dd6bd073652eb6d4cdf63cd9cce9a815e --- mo scrapers and apsc2012 imoort Former-commit-id: 65baeca74e6e972c9a6b51dd8b21387e9b017d92 --- --- a/admin/exportEmployees.csv.php +++ b/admin/exportEmployees.csv.php @@ -4,7 +4,8 @@ $format = "csv"; //$format = "json"; -if (isset($_REQUEST['format'])) $format = $_REQUEST['format']; +if (isset($_REQUEST['format'])) + $format = $_REQUEST['format']; setlocale(LC_CTYPE, 'C'); if ($format == "csv") { $headers = Array("name"); @@ -21,7 +22,6 @@ if (isset($row->value->statistics->employees)) { $headers = array_unique(array_merge($headers, array_keys(object_to_array($row->value->statistics->employees)))); - } } } catch (SetteeRestClientException $e) { @@ -40,15 +40,14 @@ fputcsv($fp, $headers); } else if ($format == "json") { echo '{ - "labels" : ["' . implode('","', $headers) . '"],'.PHP_EOL; + "labels" : ["' . implode('","', $headers) . '"],' . PHP_EOL; } try { $agencies = $db->get_view("app", "all", null, true)->rows; //print_r($agencies); $first = true; if ($format == "json") { - echo '"data" : ['.PHP_EOL; - + echo '"data" : [' . PHP_EOL; } foreach ($agencies as $agency) { @@ -56,25 +55,35 @@ $row = Array(); $agencyEmployeesArray = object_to_array($agency->value->statistics->employees); foreach ($headers as $i => $fieldName) { + if ($format == "csv") { + if (isset($agencyEmployeesArray[$fieldName])) { + $row[] = $agencyEmployeesArray[$fieldName]["value"] ; + } else if ($i == 0) { + $row[] = $agency->value->name; + } else { + $row[] = 0; + } + } else if ($format == "json") { if (isset($agencyEmployeesArray[$fieldName])) { - $row[] = '['.$i.','.$agencyEmployeesArray[$fieldName]["value"].']'; + $row[] = '[' . $i . ',' . $agencyEmployeesArray[$fieldName]["value"] . ']'; } else { - $row[] = '['.$i.',0]'; + $row[] = '[' . $i . ',0]'; } + } } if ($format == "csv") { fputcsv($fp, array_values($row)); } else if ($format == "json") { - if (!$first) echo ","; - echo '{"data" : [' . implode(",", array_values($row)) . '], "label": "'.$agency->value->name.'", "lines" : { "show" : true }, "points" : { "show" : true }}'.PHP_EOL; + if (!$first) + echo ","; + echo '{"data" : [' . implode(",", array_values($row)) . '], "label": "' . $agency->value->name . '", "lines" : { "show" : true }, "points" : { "show" : true }}' . PHP_EOL; $first = false; } } } if ($format == "json") { - echo '] - }'.PHP_EOL; - + echo '] + }' . PHP_EOL; } } catch (SetteeRestClientException $e) { setteErrorHandler($e); --- a/admin/importAPSCEmployees.php +++ b/admin/importAPSCEmployees.php @@ -47,13 +47,17 @@ $changed = false; if (!isset($doc->statistics)) { $changed = true; - $doc->statistics = Array(); + $doc->statistics = new stdClass(); + } + if (!isset($doc->statistics->employees)) { + $changed = true; + $doc->statistics->employees = new stdClass(); } foreach ($sum as $timePeriod => $value) { if (!isset($doc->statistics->employees->$timePeriod->value) || $doc->statistics->employees->$timePeriod->value != $value) { $changed = true; - $doc->statistics["employees"][$timePeriod] = Array("value" => $value, "source" => "http://apsc.gov.au/stateoftheservice/"); + $doc->statistics->employees->$timePeriod = Array("value" => $value, "source" => "http://apsc.gov.au/stateoftheservice/"); } } if ($changed) { --- /dev/null +++ b/admin/importAPSCEmployees2012.php @@ -1,1 +1,86 @@ +get_db('disclosr-agencies'); +$rows = $db->get_view("app", "byName")->rows; +$nametoid = Array(); +$sums = Array(); +$functions = Array(); +foreach ($rows as $row) { + $nametoid[trim($row->key)] = $row->value; +} + + +$request = Requests::get("http://www.apsc.gov.au/publications-and-media/parliamentary/state-of-the-service/new-sosr/appendix-2-aps-agencies"); +$doc = phpQuery::newDocumentHTML($request->body); +phpQuery::selectDocument($doc); +foreach (pq('tr')->elements as $tr) { + //echo $tr->nodeValue.PHP_EOL; + $agency = ""; + $employees = ""; + $function = ""; + $i = 0; + foreach ($tr->childNodes as $td) { + //echo $td->nodeValue." $i
"; + if ($i == 0) + $agency = $td->nodeValue; + if ($i == 2) { + $employees = trim(str_replace(",", "", $td->nodeValue)); + } + if ($i == 4) { + $function = $td->nodeValue; + } + $i++; + } + if ($agency != "" && $employees != "" && $function != "") { + $name = trim(str_replace('2','',$agency)); + //echo "$name

" . PHP_EOL; + if (isset($nametoid[$name])) { + $id = $nametoid[$name]; + //echo $id . "
" . PHP_EOL; + @$sums[$id]["2011-2012"] += $employees; + $functions[$id] = $function; + } else if ($agency != "Agency"){ + echo "
ERROR NAME '$agency' MISSING FROM ID LIST

" . PHP_EOL; + + die(); + } + } else { + echo "skipped $agency"; + } +} +//print_r($sums); +foreach ($sums as $id => $sum) { + echo $id . "
" . PHP_EOL; + $doc = $db->get($id); + echo $doc->name . "
" . PHP_EOL; + // print_r($doc); + $changed = false; + if (!isset($doc->statistics)) { + $changed = true; + $doc->statistics = new stdClass(); + } + if (!isset($doc->statistics->employees)) { + $changed = true; + $doc->statistics->employees = new stdClass(); + } + foreach ($sum as $timePeriod => $value) { + if (!isset($doc->statistics->employees->$timePeriod->value) + || $doc->statistics->employees->$timePeriod->value != $value) { + $changed = true; + $doc->statistics->employees->$timePeriod = Array("value" => $value, "source" => "http://apsc.gov.au/stateoftheservice/"); + $doc->employees = $value; + $doc->functionClassification = $functions[$id]; + } + } + + if ($changed) { + $db->save($doc); + } else { + echo "not changed" . "
" . PHP_EOL; + } +} +// employees: timeperiod, source = apsc state of service, value +?> + --- a/documents/disclogsList.php +++ b/documents/disclogsList.php @@ -19,46 +19,50 @@ if ($rows) { foreach ($rows as $row) { + if ((!isset($row->value->status) || $row->value->status != "suspended") && isset($row->value->foiEmail)) { + echo ""; + if (isset($row->value->website)) echo ""; + echo "" . $row->value->name . ""; + if (isset($row->value->website)) echo ""; + if ($ENV == "DEV") + echo "
(" . $row->id . ")"; + echo "\n"; + $agencies++; - echo "" . $row->value->name . ""; - if ($ENV == "DEV") - echo "
(" . $row->id . ")"; - echo "\n"; -$agencies++; - - echo ""; - if (isset($row->value->FOIDocumentsURL)) { - $disclogs++; - echo '' - . $row->value->FOIDocumentsURL . ''; - if ($ENV == "DEV") - echo '
(' - . 'view local copy)'; - } else { - echo ""; - } - echo "\n"; - if (isset($row->value->FOIDocumentsURL)) { - if (file_exists("./scrapers/" . $row->id . '.py')) { - echo ""; - $green++; - } else if (file_exists("./scrapers/" . $row->id . '.txt')) { - echo ""; - $orange++; + echo ""; + if (isset($row->value->FOIDocumentsURL)) { + $disclogs++; + echo '' + . $row->value->FOIDocumentsURL . ''; + if ($ENV == "DEV") + echo '
(' + . 'view local copy)'; } else { echo ""; - $red++; } + echo "\n"; + if (isset($row->value->FOIDocumentsURL)) { + if (file_exists("./scrapers/" . $row->id . '.py')) { + echo ""; + $green++; + } else if (file_exists("./scrapers/" . $row->id . '.txt')) { + echo ""; + $orange++; + } else { + echo ""; + $red++; + } + } + echo "\n"; } - echo "\n"; } } } catch (SetteeRestClientException $e) { setteErrorHandler($e); } echo ""; -echo $agencies." agencies, ".round(($disclogs/$agencies)*100)."% with disclosure logs; " -.round(($green/$disclogs)*100)."% logs with scrapers ".round(($red/$disclogs)*100)."% logs without scrapers ".round(($orange/$disclogs)*100)."% logs Work-In-Progress scrapers "; +echo $agencies . " agencies, " . round(($disclogs / $agencies) * 100) . "% with disclosure logs; " + . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers "; include_footer_documents(); ?> --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -92,7 +92,8 @@ return table.find_all('tr') def getDate(self, content, entry, doc): date = ''.join(content.stripped_strings).strip() - date = str.replace("Octber","October",date) + (a,b,c) = date.partition("(") + date = a.replace("Octber","October") print date edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") print edate --- /dev/null +++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.txt @@ -1,1 +1,2 @@ +pdf --- /dev/null +++ b/documents/scrapers/0372b19123076338d483f624c433727b.txt @@ -1,1 +1,2 @@ +docx --- /dev/null +++ b/documents/scrapers/0ae822d1a748e60d90f0b79b97d5a3e5.txt @@ -1,1 +1,2 @@ +ACMA style --- /dev/null +++ b/documents/scrapers/0ced9dd2de36100c3cabdb7fd8e843a9.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/1d404c4934f74feacd00dcb434e7c10a.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "cphMain_C001_Col01").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/24bd71114d3975ed9a63ad29624c62c9.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "inner_content") + def getColumnCount(self): + return 2 + def getColumns(self,columns): + (date, title) = columns + return (date, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/2cac2cd1f42687db2d04fa20b5b6a538.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (id, title, date) = columns + return (id, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/31685505438d393f45a90f442b8fa27f.txt @@ -1,1 +1,2 @@ +pdf --- /dev/null +++ b/documents/scrapers/3e2f110af49d62833a835bd257771ffb.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/4c57389dda9bd454bcb08bc1e5ed87bf.txt @@ -1,1 +1,2 @@ +parent --- /dev/null +++ b/documents/scrapers/4d2af2dcc72f1703bbf04b13b03720a8.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/525c3953187da08cd702359b2fc2997f.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/54cbb3439276062b7a9f007f9f69d1f6.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/601aedeef4344638d635bdd761e9fdba.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (date, title, description,notes) = columns + return (title, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/655d4d67333536bda18d68265dfe7e80.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id="node-30609") + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/656f7bb1884f4b9d31ebe2a5f5f58064.txt @@ -1,1 +1,2 @@ +list style --- /dev/null +++ b/documents/scrapers/65ec17101b00519e6d88c5a9f33c2c46.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (id, date, description) = columns + return (id, date, description, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/69d59284ef0ccd2677394d82d3292abc.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "centercontent").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/6ac74a939f420c6194ae29224809734a.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/6afdde1d4ff1ad8d8cfe1a8675ea83bd.txt @@ -1,1 +1,2 @@ +PDF --- /dev/null +++ b/documents/scrapers/72a295f10734d64e8185f651fd2b39ea.txt @@ -1,1 +1,2 @@ +weird div based log with tables of links --- /dev/null +++ b/documents/scrapers/75d8f1c605ef9da0c2590264b7aa046b.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "content-middle").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/768bbbfb34115873af361af8519b38a9.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/794ea270edc9aa4f70f2a84bbc5ecc7a.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "cphMain_C001_Col01").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/795e7a8afb39a420360aa207b0cb1306.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/7b39ce7f362a0af9a711eaf223943eea.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/7ec28d7d97fcf493b1350acd03e3642e.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/7f55a3c42ad7460254906aa043a6e324.py @@ -1,1 +1,24 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getTitle(self, content, entry, doc): + doc.update({'title': content.stripped_strings.next()}) + return + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, id, description) = columns + return (id, date, description, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/8317df630946937864d31a4728ad8ee8.txt @@ -1,1 +1,2 @@ +pdf --- /dev/null +++ b/documents/scrapers/8796220032faf94501bd366763263685.txt @@ -1,1 +1,2 @@ +multiple pages --- /dev/null +++ b/documents/scrapers/8aae1c28db7f3ce10f232a0137be6bb2.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt @@ -1,1 +1,49 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getDescription(self,content, entry,doc): + link = None + links = [] + description = "" + for atag in entry.find_all('a'): + if atag.has_key('href'): + link = scrape.fullurl(self.getURL(),atag['href']) + (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) + if htcontent != None: + if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": + # http://www.crummy.com/software/BeautifulSoup/documentation.html + soup = BeautifulSoup(htcontent) + for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'): + if row != None: + rowtitle = row.find('th').string + description = description + "\n" + rowtitle + ": " + for text in row.find('td').stripped_strings: + description = description + text + for atag in row.find_all("a"): + if atag.has_key('href'): + links.append(scrape.fullurl(link,atag['href'])) + + if links != []: + doc.update({'links': links}) + if description != "": + doc.update({ 'description': description}) + + def getColumnCount(self): + return 2 + def getTable(self,soup): + return soup.find(class_ = "ms-rteTable-GreyAlternating") + def getColumns(self,columns): + (date, title) = columns + return (title, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() +# old site too http://archive.treasury.gov.au/content/foi_publications.asp + --- /dev/null +++ b/documents/scrapers/9282306e244040c9e4ae5705f06f9548.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/93ce83e46f5c2c4ca1b7f199b59b4bd2.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date,logdate, description) = columns + return (id, date, description, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/99328d76c8efb56ff3f1da79b9d1b17f.txt @@ -1,1 +1,2 @@ +acma style --- /dev/null +++ b/documents/scrapers/9961dc45e046288ad1431941653af20c.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/9f4815bfdcb918a036e4bb43a30f8d77.txt @@ -1,1 +1,1 @@ - +no disclog --- /dev/null +++ b/documents/scrapers/a1ab9c80ab473958676c62c1a25dd502.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/a43467fe82b840a353b380c4d7462a4c.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/a687a9eaab9e10e9e118d3fd7cf0e13a.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id="ctl00_ContentPlaceHolderMainNoAjax_EdtrTD1494_2").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (blank,id, title,date) = columns + return (id, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/b0fb402314e685238537105ee0e70c84.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/b7770c4584332cff42bb6abb3326e564.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "ctl00_PlaceHolderMain_Content__ControlWrapper_RichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/b91f866928eb61959dbbab56313214fc.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/bc91b878e2317fa231cc2c512e2027f0.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/bf16d4ba0d306ee03e5a1d32aaba3da1.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(summary="This table shows every FOI request to date.") + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/cca17a34bd490474a316fe0a1ca03c25.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "ctl00_PlaceHolderMain_ctl01__ControlWrapper_RichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/cde8eb4a2e40abb18d8b28d3b85bc9b0.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(summary="This table lists the schedule of upcoming courses.") + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/ce34d1e9b55911e4272d2d388821f311.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/d1296c366287f7a9faedf235c7e6df01.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id="main").table + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/e0614dc3a9e25d375370ffd82f7165ac.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 6 + def getColumns(self,columns): + (id, date, title, description,deldate, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/e64c71f4986f78675a252104c5a5f359.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/e770921522a49dc77de208cc724ce134.txt @@ -1,1 +1,2 @@ +c'est ne pas une table --- /dev/null +++ b/documents/scrapers/e90b1b7cbb83e3eed0b5f849c7e3af79.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "inner_content") + def getColumnCount(self): + return 2 + def getColumns(self,columns): + (date, title) = columns + return (date, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/ee30aad97f0bb32e74c4587404b67ce4.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, title, date, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/f189459fc43f941e0d4ecfba52c666f3.txt @@ -1,1 +1,2 @@ +no disclog --- /dev/null +++ b/documents/scrapers/f5ce2d1651739704634eb8ca4b2b46d3.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "ctl00_PlaceHolderMain_PublishingPageContent__ControlWrapper_RichHtmlField").table + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() +