begin data.gov.au dataset scraper (documents/datagov.py)
Former-commit-id: a8775a64a3cdda480e4433742ed7ea6ca6a437ef
--- a/documents/404.html
+++ b/documents/404.html
@@ -1,44 +1,156 @@
<!doctype html>
<html lang="en">
<head>
- <meta charset="utf-8">
- <title>Page Not Found :(</title>
- <style>
- ::-moz-selection { background: #fe57a1; color: #fff; text-shadow: none; }
- ::selection { background: #fe57a1; color: #fff; text-shadow: none; }
- html { padding: 30px 10px; font-size: 20px; line-height: 1.4; color: #737373; background: #f0f0f0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
- html, input { font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; }
- body { max-width: 500px; _width: 500px; padding: 30px 20px 50px; border: 1px solid #b3b3b3; border-radius: 4px; margin: 0 auto; box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff; background: #fcfcfc; }
- h1 { margin: 0 10px; font-size: 50px; text-align: center; }
- h1 span { color: #bbb; }
- h3 { margin: 1.5em 0 0.5em; }
- p { margin: 1em 0; }
- ul { padding: 0 0 0 40px; margin: 1em 0; }
- .container { max-width: 380px; _width: 380px; margin: 0 auto; }
- /* google search */
- #goog-fixurl ul { list-style: none; padding: 0; margin: 0; }
- #goog-fixurl form { margin: 0; }
- #goog-wm-qt, #goog-wm-sb { border: 1px solid #bbb; font-size: 16px; line-height: normal; vertical-align: top; color: #444; border-radius: 2px; }
- #goog-wm-qt { width: 220px; height: 20px; padding: 5px; margin: 5px 10px 0 0; box-shadow: inset 0 1px 1px #ccc; }
- #goog-wm-sb { display: inline-block; height: 32px; padding: 0 10px; margin: 5px 0 0; white-space: nowrap; cursor: pointer; background-color: #f5f5f5; background-image: -webkit-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -moz-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -ms-linear-gradient(rgba(255,255,255,0), #f1f1f1); background-image: -o-linear-gradient(rgba(255,255,255,0), #f1f1f1); -webkit-appearance: none; -moz-appearance: none; appearance: none; *overflow: visible; *display: inline; *zoom: 1; }
- #goog-wm-sb:hover, #goog-wm-sb:focus { border-color: #aaa; box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1); background-color: #f8f8f8; }
- #goog-wm-qt:focus, #goog-wm-sb:focus { border-color: #105cb6; outline: 0; color: #222; }
- input::-moz-focus-inner { padding: 0; border: 0; }
- </style>
+ <meta charset="utf-8">
+ <title>Page Not Found :(</title>
+ <style>
+ ::-moz-selection {
+ background: #fe57a1;
+ color: #fff;
+ text-shadow: none;
+ }
+
+ ::selection {
+ background: #fe57a1;
+ color: #fff;
+ text-shadow: none;
+ }
+
+ html {
+ padding: 30px 10px;
+ font-size: 20px;
+ line-height: 1.4;
+ color: #737373;
+ background: #f0f0f0;
+ -webkit-text-size-adjust: 100%;
+ -ms-text-size-adjust: 100%;
+ }
+
+ html, input {
+ font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
+ }
+
+ body {
+ max-width: 500px;
+ _width: 500px;
+ padding: 30px 20px 50px;
+ border: 1px solid #b3b3b3;
+ border-radius: 4px;
+ margin: 0 auto;
+ box-shadow: 0 1px 10px #a7a7a7, inset 0 1px 0 #fff;
+ background: #fcfcfc;
+ }
+
+ h1 {
+ margin: 0 10px;
+ font-size: 50px;
+ text-align: center;
+ }
+
+ h1 span {
+ color: #bbb;
+ }
+
+ h3 {
+ margin: 1.5em 0 0.5em;
+ }
+
+ p {
+ margin: 1em 0;
+ }
+
+ ul {
+ padding: 0 0 0 40px;
+ margin: 1em 0;
+ }
+
+ .container {
+ max-width: 380px;
+ _width: 380px;
+ margin: 0 auto;
+ }
+
+ /* google search */
+ #goog-fixurl ul {
+ list-style: none;
+ padding: 0;
+ margin: 0;
+ }
+
+ #goog-fixurl form {
+ margin: 0;
+ }
+
+ #goog-wm-qt, #goog-wm-sb {
+ border: 1px solid #bbb;
+ font-size: 16px;
+ line-height: normal;
+ vertical-align: top;
+ color: #444;
+ border-radius: 2px;
+ }
+
+ #goog-wm-qt {
+ width: 220px;
+ height: 20px;
+ padding: 5px;
+ margin: 5px 10px 0 0;
+ box-shadow: inset 0 1px 1px #ccc;
+ }
+
+ #goog-wm-sb {
+ display: inline-block;
+ height: 32px;
+ padding: 0 10px;
+ margin: 5px 0 0;
+ white-space: nowrap;
+ cursor: pointer;
+ background-color: #f5f5f5;
+ background-image: -webkit-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+ background-image: -moz-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+ background-image: -ms-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+ background-image: -o-linear-gradient(rgba(255, 255, 255, 0), #f1f1f1);
+ -webkit-appearance: none;
+ -moz-appearance: none;
+ appearance: none;
+ *overflow: visible;
+ *display: inline;
+ *zoom: 1;
+ }
+
+ #goog-wm-sb:hover, #goog-wm-sb:focus {
+ border-color: #aaa;
+ box-shadow: 0 1px 1px rgba(0, 0, 0, 0.1);
+ background-color: #f8f8f8;
+ }
+
+ #goog-wm-qt:focus, #goog-wm-sb:focus {
+ border-color: #105cb6;
+ outline: 0;
+ color: #222;
+ }
+
+ input::-moz-focus-inner {
+ padding: 0;
+ border: 0;
+ }
+ </style>
</head>
<body>
- <div class="container">
+<div class="container">
<h1>Not found <span>:(</span></h1>
+
<p>Sorry, but the page you were trying to view does not exist.</p>
+
<p>It looks like this was the result of either:</p>
<ul>
- <li>a mistyped address</li>
- <li>an out-of-date link</li>
+ <li>a mistyped address</li>
+ <li>an out-of-date link</li>
</ul>
<script>
- var GOOG_FIXURL_LANG = (navigator.language || '').slice(0,2),GOOG_FIXURL_SITE = location.host;
+ var GOOG_FIXURL_LANG = (navigator.language || '').slice(0, 2), GOOG_FIXURL_SITE = location.host;
</script>
<script src="http://linkhelp.clients.google.com/tbproxy/lh/wm/fixurl.js"></script>
- </div>
+</div>
--- a/documents/agency.php
+++ b/documents/agency.php
@@ -12,8 +12,11 @@
include_header_documents((isset($_REQUEST['id']) ? $idtoname[$_REQUEST['id']] : 'Entries by Agency'));
$endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
?>
-<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
-<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
+ <div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act
+ in one place!
+ </div>
+ <a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
+ <br>
<?php
try {
if ($_REQUEST['id']) {
--- a/documents/charts.php
+++ b/documents/charts.php
@@ -18,144 +18,145 @@
<div id="bydate" style="width:1000px;height:300px;"></div>
<div id="byagency" style="width:1200px;height:300px;"></div>
<script id="source">
- window.onload = function() {
- $(document).ready(function() {
- var
- d1 = [],
- options1,
- o1;
+ window.onload = function () {
+ $(document).ready(function () {
+ var
+ d1 = [],
+ options1,
+ o1;
-<?php
- try {
- $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true",null, false,false,true)->rows;
+ <?php
+ try {
+ $rows = $foidocsdb->get_view("app", "byDateMonthYear?group=true",null, false,false,true)->rows;
- $dataValues = Array();
- foreach ($rows as $row) {
- $dataValues[$row->key] = $row->value;
- }
- $i = 0;
- ksort($dataValues);
- foreach ($dataValues as $key => $value) {
-$date = date_create_from_format('Y-m-d', $key);
-if (date_format($date, 'U') != "") {
- echo " d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
-// echo " emplabels.push('$key');" . PHP_EOL;
- $i++;
-}
- }
- } catch (SetteeRestClientException $e) {
- setteErrorHandler($e);
- }
- ?>
+ $dataValues = Array();
+ foreach ($rows as $row) {
+ $dataValues[$row->key] = $row->value;
+ }
+ $i = 0;
+ ksort($dataValues);
+ foreach ($dataValues as $key => $value) {
+ $date = date_create_from_format('Y-m-d', $key);
+ if (date_format($date, 'U') != "") {
+ echo " d1.push([".date_format($date, 'U')."000, $value]);" . PHP_EOL;
+ // echo " emplabels.push('$key');" . PHP_EOL;
+ $i++;
+ }
+ }
+ } catch (SetteeRestClientException $e) {
+ setteErrorHandler($e);
+ }
+ ?>
-
- options1 = {
- xaxis : {
- mode : 'time',
- labelsAngle : 45
- },
- selection : {
- mode : 'x'
- },
- HtmlText : false,
- title : 'Time'
- };
-
- // Draw graph with default options, overwriting with passed options
- function drawGraph (opts) {
+ options1 = {
+ xaxis: {
+ mode: 'time',
+ labelsAngle: 45
+ },
+ selection: {
+ mode: 'x'
+ },
+ HtmlText: false,
+ title: 'Time'
+ };
- // Clone the options, so the 'options' variable always keeps intact.
- o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
+ // Draw graph with default options, overwriting with passed options
+ function drawGraph(opts) {
- // Return a new graph.
- return Flotr.draw(
- document.getElementById("bydate"),
- [ d1 ],
- o1
- );
- }
+ // Clone the options, so the 'options' variable always keeps intact.
+ o1 = Flotr._.extend(Flotr._.clone(options1), opts || {});
- graph = drawGraph();
-
- Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function(area){
- // Draw selected area
- graph = drawGraph({
- xaxis : { min : area.x1, max : area.x2, mode : 'time', labelsAngle : 45 },
- yaxis : { min : area.y1, max : area.y2 }
- });
- });
-
- // When graph is clicked, draw the graph with default area.
- Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () { graph = drawGraph(); });
+ // Return a new graph.
+ return Flotr.draw(
+ document.getElementById("bydate"),
+ [ d1 ],
+ o1
+ );
+ }
+
+ graph = drawGraph();
+
+ Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:select', function (area) {
+ // Draw selected area
+ graph = drawGraph({
+ xaxis: { min: area.x1, max: area.x2, mode: 'time', labelsAngle: 45 },
+ yaxis: { min: area.y1, max: area.y2 }
+ });
+ });
+
+ // When graph is clicked, draw the graph with default area.
+ Flotr.EventAdapter.observe(document.getElementById("bydate"), 'flotr:click', function () {
+ graph = drawGraph();
+ });
});
-};
+ };
-var d2 = [];
-var agencylabels = [];
-function agencytrackformatter(obj) {
-
- return agencylabels[Math.floor(obj.x)] +" = "+obj.y;
-
- }
- function agencytickformatter(val, axis) {
- if (agencylabels[Math.floor(val)]) {
- return '<p style="margin-top:8em;-webkit-transform:rotate(-90deg);">'+(agencylabels[Math.floor(val)])+"</b>";
-
- } else {
- return "";
- }
- }
-<?php
- try {
- $rows = $foidocsdb->get_view("app", "byAgencyID?group=true",null, false,false,true)->rows;
+ var d2 = [];
+ var agencylabels = [];
+ function agencytrackformatter(obj) {
+
+ return agencylabels[Math.floor(obj.x)] + " = " + obj.y;
+
+ }
+ function agencytickformatter(val, axis) {
+ if (agencylabels[Math.floor(val)]) {
+ return '<p style="margin-top:8em;-webkit-transform:rotate(-90deg);">' + (agencylabels[Math.floor(val)]) + "</p>";
+
+ } else {
+ return "";
+ }
+ }
+ <?php
+ try {
+ $rows = $foidocsdb->get_view("app", "byAgencyID?group=true",null, false,false,true)->rows;
- $dataValues = Array();
- $i = 0;
- foreach ($rows as $row) {
- echo " d2.push([".$i.", $row->value]);" . PHP_EOL;
- echo " agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
-
- $i++;
+ $dataValues = Array();
+ $i = 0;
+ foreach ($rows as $row) {
+ echo " d2.push([".$i.", $row->value]);" . PHP_EOL;
+ echo " agencylabels.push(['".str_replace("'","",$idtoname[$row->key])."']);" . PHP_EOL;
+
+ $i++;
+ }
+ } catch (SetteeRestClientException $e) {
+ setteErrorHandler($e);
}
- } catch (SetteeRestClientException $e) {
- setteErrorHandler($e);
- }
- ?>
- // Draw the graph
- Flotr.draw(
- document.getElementById("byagency"),
- [d2],
- {
- bars : {
- show : true,
- horizontal : false,
- shadowSize : 0,
- barWidth : 0.5
- },
-mouse : {
- track : true,
- relative : true,
- trackFormatter: agencytrackformatter
- },
- yaxis : {
- min : 0,
- autoscaleMargin : 1
- },
- xaxis: {
- minorTickFreq: 1,
- noTicks: agencylabels.length,
- showMinorLabels: true,
- tickFormatter: agencytickformatter
- },
- legend: {
- show: false
- }
- }
- );
+ ?>
+ // Draw the graph
+ Flotr.draw(
+ document.getElementById("byagency"),
+ [d2],
+ {
+ bars: {
+ show: true,
+ horizontal: false,
+ shadowSize: 0,
+ barWidth: 0.5
+ },
+ mouse: {
+ track: true,
+ relative: true,
+ trackFormatter: agencytrackformatter
+ },
+ yaxis: {
+ min: 0,
+ autoscaleMargin: 1
+ },
+ xaxis: {
+ minorTickFreq: 1,
+ noTicks: agencylabels.length,
+ showMinorLabels: true,
+ tickFormatter: agencytickformatter
+ },
+ legend: {
+ show: false
+ }
+ }
+ );
</script>
<?php
--- a/documents/crossdomain.xml
+++ b/documents/crossdomain.xml
@@ -3,24 +3,23 @@
<cross-domain-policy>
-<!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
+ <!-- Read this: www.adobe.com/devnet/articles/crossdomain_policy_file_spec.html -->
-<!-- Most restrictive policy: -->
- <site-control permitted-cross-domain-policies="none"/>
+ <!-- Most restrictive policy: -->
+ <site-control permitted-cross-domain-policies="none"/>
-
-<!-- Least restrictive policy: -->
-<!--
- <site-control permitted-cross-domain-policies="all"/>
- <allow-access-from domain="*" to-ports="*" secure="false"/>
- <allow-http-request-headers-from domain="*" headers="*" secure="false"/>
--->
-<!--
- If you host a crossdomain.xml file with allow-access-from domain="*"
- and don’t understand all of the points described here, you probably
- have a nasty security vulnerability. ~ simon willison
--->
+ <!-- Least restrictive policy: -->
+ <!--
+ <site-control permitted-cross-domain-policies="all"/>
+ <allow-access-from domain="*" to-ports="*" secure="false"/>
+ <allow-http-request-headers-from domain="*" headers="*" secure="false"/>
+ -->
+ <!--
+ If you host a crossdomain.xml file with allow-access-from domain="*"
+ and don’t understand all of the points described here, you probably
+ have a nasty security vulnerability. ~ simon willison
+ -->
</cross-domain-policy>
--- /dev/null
+++ b/documents/datagov.py
@@ -1,1 +1,48 @@
+import sys, os
+import scrape
+from bs4 import BeautifulSoup
+
+
+listurl = "http://data.gov.au/data/"
+(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
+ listurl, "data", "AGIMO")
+soup = BeautifulSoup(datasetlisthtml)
+for atag in soup.find_all(class_='result-title'):
+ if atag.has_key('href'):
+ url = scrape.fullurl(listurl, atag['href'])
+ (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+ url, "data", "AGIMO")
+ hash = scrape.mkhash(scrape.canonurl(url))
+ doc = scrape.docsdb.get(hash)
+ if "metadata" not in doc.keys():
+ doc['metadata'] = {}
+ soup = BeautifulSoup(html)
+ for metatag in soup.find_all('meta'):
+ if metatag.has_key('name'):
+ doc['metadata'][metatag['name']] = metatag['content']
+ for list in soup.find_all('dl'):
+ last_title = ""
+ for child in list.children:
+ if str(type(child)) != "<class 'bs4.element.NavigableString'>":
+ if child.name == 'dt' and child.string != None:
+ last_title = child.string.strip()
+ if child.name == 'dd':
+ #print last_title
+ if last_title == "Download":
+ for item in child.find_all("li"):
+ link = item.find("a")
+ format = item.find(property="dc:format")
+ linkobj = {"href":link['href'].replace("/bye?","").strip(), "name": link.string.strip(),
+ "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+ doc['metadata'][last_title] = linkobj
+
+ else:
+ atags = child.find_all('a')
+ if len(atags) < 2:
+ [s.extract() for s in child(class_='viewAll')]
+ doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
+ else:
+ doc['metadata'][last_title] = [item.string.replace(",","").strip() for item in atags]
+ print doc['metadata']
+ sys.exit("ggg")
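The new documents/datagov.py walks the data.gov.au dataset list, fetches each dataset page through the shared scrape module, and copies metadata out of the page's <meta> tags and <dl> definition lists (each <dt> names a field, the following <dd> holds its value, and the "Download" entry lists file links). The standalone sketch below isolates that <dl> walk so it can run without the scrape module or CouchDB; the sample markup and field names are assumptions for illustration, not data.gov.au's actual page structure, and file-size extraction is omitted for brevity.

    # Minimal sketch: extract <dl> metadata the way datagov.py does, but
    # against inline sample HTML. The markup below is illustrative only.
    from bs4 import BeautifulSoup, NavigableString

    sample = """
    <dl>
      <dt>Agency</dt><dd>Hypothetical Agency</dd>
      <dt>Keywords</dt><dd><a href="#">budget</a> <a href="#">spending</a></dd>
      <dt>Download</dt>
      <dd><ul><li><a href="/bye?/files/data.csv">data.csv</a>
          <span property="dc:format">CSV</span></li></ul></dd>
    </dl>
    """

    metadata = {}
    soup = BeautifulSoup(sample, "html.parser")
    for dl in soup.find_all("dl"):
        last_title = ""
        for child in dl.children:
            # skip the whitespace text nodes between tags; this is the tidier
            # equivalent of datagov.py's str(type(child)) comparison
            if isinstance(child, NavigableString):
                continue
            if child.name == "dt" and child.string is not None:
                last_title = child.string.strip()
            elif child.name == "dd":
                if last_title == "Download":
                    # the Download entry holds file links plus a dc:format marker
                    for item in child.find_all("li"):
                        link = item.find("a")
                        fmt = item.find(property="dc:format")
                        metadata[last_title] = {
                            "href": link["href"].replace("/bye?", "").strip(),
                            "name": link.string.strip(),
                            "format": fmt.string.strip() if fmt else "",
                        }
                else:
                    # plain values are flattened text; multi-link values become lists
                    atags = child.find_all("a")
                    if len(atags) < 2:
                        metadata[last_title] = "".join(child.stripped_strings).strip()
                    else:
                        metadata[last_title] = [a.string.strip() for a in atags]

    print(metadata)

The isinstance() test and the fmt guard are the only deliberate departures from the datagov.py logic above; the field handling (single value, link list, Download dict) mirrors it.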
--- a/documents/date.php
+++ b/documents/date.php
@@ -5,8 +5,11 @@
include_once('../include/common.inc.php');
$endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99');
?>
-<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in one place!</div>
-<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a><br>
+<div class="headline">Read all the information released by Australian Federal Government agencies under the FOI Act in
+ one place!
+</div>
+<a style='float:right' href="rss.xml.php"><img src="img/feed-icon-14x14.png" alt="RSS Icon"/> All Agencies RSS Feed</a>
+<br>
<?php
/*$agenciesdb = $server->get_db('disclosr-agencies');
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -34,10 +34,10 @@
if (isset($row->value->FOIDocumentsURL)) {
$disclogs++;
echo '<a href="' . $row->value->FOIDocumentsURL . '">'
- . $row->value->FOIDocumentsURL . '</a>';
+ . $row->value->FOIDocumentsURL . '</a>';
if ($ENV == "DEV")
echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
- . 'view local copy</a>)</small>';
+ . 'view local copy</a>)</small>';
} else {
echo "<font color='red'><abbr title='No'>✘</abbr></font>";
}
@@ -49,11 +49,11 @@
} else if (file_exists("./scrapers/" . $row->id . '.txt')) {
if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") {
echo "<font color='yellow'><abbr title='No log table exists at URL to scrape'><b>◎</b></abbr></font>";
- $yellow++;
+ $yellow++;
} else {
echo file_get_contents("./scrapers/" . $row->id . '.txt');
- echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
- $orange++;
+ echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
+ $orange++;
}
} else {
echo "<font color='red'><abbr title='No'>✘</abbr></font>";
@@ -69,7 +69,7 @@
}
echo "</table>";
echo $agencies . " agencies, " . round(($disclogs / $agencies) * 100) . "% with disclosure logs; "
- . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers ";
+ . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers ";
include_footer_documents();
?>
--- a/documents/exportAll.csv.php
+++ b/documents/exportAll.csv.php
@@ -39,7 +39,7 @@
if (is_array($agencyArray[$fieldName])) {
$row[] = implode(";", $agencyArray[$fieldName]);
} else {
- $row[] = str_replace(Array("\n", '"', "\t"),"",$agencyArray[$fieldName]);
+ $row[] = str_replace(Array("\n", '"', "\t"), "", $agencyArray[$fieldName]);
}
} else {
$row[] = "";
--- /dev/null
+++ b/documents/gazette.py
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -1,5 +1,6 @@
import sys
import os
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
@@ -51,12 +52,12 @@
""" do the scraping """
return
+
class GenericHTMLDisclogScraper(GenericDisclogScraper):
-
def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments']
(url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
- self.getURL(), "foidocuments", self.getAgencyID())
+ self.getURL(), "foidocuments", self.getAgencyID())
content = rcontent
dochash = scrape.mkhash(content)
doc = foidocsdb.get(dochash)
@@ -66,33 +67,32 @@
last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
if last_attach != None:
html_diff = difflib.HtmlDiff()
- description = description + "\nChanges: "
- description = description + html_diff.make_table(last_attach.read().split('\n'),
- content.split('\n'))
+ diff = html_diff.make_table(last_attach.read().split('\n'),
+ content.split('\n'))
edate = date.today().strftime("%Y-%m-%d")
doc = {'_id': dochash, 'agencyID': self.getAgencyID()
- , 'url': self.getURL(), 'docID': dochash,
- "date": edate, "title": "Disclosure Log Updated", "description": description}
+ , 'url': self.getURL(), 'docID': dochash,
+ "date": edate, "title": "Disclosure Log Updated", "description": description, "diff": diff}
foidocsdb.save(doc)
else:
print "already saved"
+
class GenericPDFDisclogScraper(GenericDisclogScraper):
-
def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments']
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
- self.getURL(), "foidocuments", self.getAgencyID())
+ self.getURL(), "foidocuments", self.getAgencyID())
laparams = LAParams()
rsrcmgr = PDFResourceManager(caching=True)
outfp = StringIO()
device = TextConverter(rsrcmgr, outfp, codec='utf-8',
- laparams=laparams)
+ laparams=laparams)
fp = StringIO()
fp.write(content)
process_pdf(rsrcmgr, device, fp, set(), caching=True,
- check_extractable=True)
+ check_extractable=True)
description = outfp.getvalue()
fp.close()
device.close()
@@ -103,19 +103,18 @@
print "saving " + dochash
edate = date.today().strftime("%Y-%m-%d")
doc = {'_id': dochash, 'agencyID': self.getAgencyID()
- , 'url': self.getURL(), 'docID': dochash,
- "date": edate, "title": "Disclosure Log Updated", "description": description}
+ , 'url': self.getURL(), 'docID': dochash,
+ "date": edate, "title": "Disclosure Log Updated", "description": description}
foidocsdb.save(doc)
else:
print "already saved"
class GenericDOCXDisclogScraper(GenericDisclogScraper):
-
def doScrape(self):
foidocsdb = scrape.couch['disclosr-foidocuments']
(url, mime_type, content) = scrape.fetchURL(scrape.docsdb
- , self.getURL(), "foidocuments", self.getAgencyID())
+ , self.getURL(), "foidocuments", self.getAgencyID())
mydoc = zipfile.ZipFile(file)
xmlcontent = mydoc.read('word/document.xml')
document = etree.fromstring(xmlcontent)
@@ -125,7 +124,7 @@
newparatextlist = []
for paratext in paratextlist:
newparatextlist.append(paratext.encode("utf-8"))
- ## Print our documnts test with two newlines under each paragraph
+ ## Print our document's text with two newlines under each paragraph
description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
dochash = scrape.mkhash(description)
doc = foidocsdb.get(dochash)
@@ -134,42 +133,42 @@
print "saving " + dochash
edate = time().strftime("%Y-%m-%d")
doc = {'_id': dochash, 'agencyID': self.getAgencyID()
- , 'url': self.getURL(), 'docID': dochash,
- "date": edate, "title": "Disclosure Log Updated", "description": description}
+ , 'url': self.getURL(), 'docID': dochash,
+ "date": edate, "title": "Disclosure Log Updated", "description": description}
foidocsdb.save(doc)
else:
print "already saved"
class GenericRSSDisclogScraper(GenericDisclogScraper):
-
- def doScrape(self):
- foidocsdb = scrape.couch['disclosr-foidocuments']
- (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
- self.getURL(), "foidocuments", self.getAgencyID())
- feed = feedparser.parse(content)
- for entry in feed.entries:
- #print entry
- print entry.id
- dochash = scrape.mkhash(entry.id)
- doc = foidocsdb.get(dochash)
- #print doc
- if doc is None:
- print "saving " + dochash
- edate = datetime.fromtimestamp(
- mktime(entry.published_parsed)).strftime("%Y-%m-%d")
-
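The GenericHTMLDisclogScraper change earlier in this patch stops appending the difflib change table to the description and instead stores it in its own "diff" field on the saved CouchDB document. The sketch below reproduces just that step with made-up page snapshots and placeholder document values (the dochash, agency ID, URL, and description are all stand-ins); it illustrates difflib.HtmlDiff.make_table rather than the scraper's full doScrape flow.

    # Minimal sketch, assuming two text snapshots stand in for
    # last_attach.read() and the freshly fetched page content.
    import difflib
    from datetime import date

    previous = "FOI 2012/01 released\nFOI 2012/02 pending\n"
    current = "FOI 2012/01 released\nFOI 2012/02 released\nFOI 2012/03 pending\n"

    # make_table returns an HTML <table> showing line-by-line changes
    html_diff = difflib.HtmlDiff()
    diff_table = html_diff.make_table(previous.split("\n"), current.split("\n"))

    # Document shape mirrors the doc dict saved in genericScrapers.py;
    # identifiers and description here are placeholders, not real values.
    doc = {
        "_id": "dochash-placeholder",
        "agencyID": "example-agency",
        "url": "http://www.example.gov.au/foi/disclosure-log",
        "docID": "dochash-placeholder",
        "date": date.today().strftime("%Y-%m-%d"),
        "title": "Disclosure Log Updated",
        "description": "Disclosure log page changed since last scrape",
        "diff": diff_table,
    }
    print(doc["date"] + ": " + doc["title"])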