From: Maxious Date: Tue, 18 Dec 2012 22:20:12 +0000 Subject: columns X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=7c78bd9f790ed5a9a2a3483346a9cb6449eddba1 --- columns Former-commit-id: 82edd5f41bab243828a5febd9e00b5fdb051dc86 --- --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,6 @@ [submodule "couchdb/couchdb-lucene"] path = couchdb/couchdb-lucene url = https://github.com/rnewson/couchdb-lucene.git -[submodule "couchdb/settee"] - path = couchdb/settee - url = https://github.com/inadarei/settee.git [submodule "lib/php-diff"] path = lib/php-diff url = https://github.com/chrisboulton/php-diff.git @@ -31,4 +28,7 @@ [submodule "documents/lib/parsedatetime"] path = documents/lib/parsedatetime url = git://github.com/bear/parsedatetime.git +[submodule "lib/FeedWriter"] + path = lib/FeedWriter + url = https://github.com/mibe/FeedWriter --- a/admin/exportEmployees.csv.php +++ b/admin/exportEmployees.csv.php @@ -4,7 +4,8 @@ $format = "csv"; //$format = "json"; -if (isset($_REQUEST['format'])) $format = $_REQUEST['format']; +if (isset($_REQUEST['format'])) + $format = $_REQUEST['format']; setlocale(LC_CTYPE, 'C'); if ($format == "csv") { $headers = Array("name"); @@ -21,7 +22,6 @@ if (isset($row->value->statistics->employees)) { $headers = array_unique(array_merge($headers, array_keys(object_to_array($row->value->statistics->employees)))); - } } } catch (SetteeRestClientException $e) { @@ -40,15 +40,14 @@ fputcsv($fp, $headers); } else if ($format == "json") { echo '{ - "labels" : ["' . implode('","', $headers) . '"],'.PHP_EOL; + "labels" : ["' . implode('","', $headers) . '"],' . PHP_EOL; } try { $agencies = $db->get_view("app", "all", null, true)->rows; //print_r($agencies); $first = true; if ($format == "json") { - echo '"data" : ['.PHP_EOL; - + echo '"data" : [' . 
PHP_EOL; } foreach ($agencies as $agency) { @@ -56,25 +55,35 @@ $row = Array(); $agencyEmployeesArray = object_to_array($agency->value->statistics->employees); foreach ($headers as $i => $fieldName) { + if ($format == "csv") { + if (isset($agencyEmployeesArray[$fieldName])) { + $row[] = $agencyEmployeesArray[$fieldName]["value"] ; + } else if ($i == 0) { + $row[] = $agency->value->name; + } else { + $row[] = 0; + } + } else if ($format == "json") { if (isset($agencyEmployeesArray[$fieldName])) { - $row[] = '['.$i.','.$agencyEmployeesArray[$fieldName]["value"].']'; + $row[] = '[' . $i . ',' . $agencyEmployeesArray[$fieldName]["value"] . ']'; } else { - $row[] = '['.$i.',0]'; + $row[] = '[' . $i . ',0]'; } + } } if ($format == "csv") { fputcsv($fp, array_values($row)); } else if ($format == "json") { - if (!$first) echo ","; - echo '{"data" : [' . implode(",", array_values($row)) . '], "label": "'.$agency->value->name.'", "lines" : { "show" : true }, "points" : { "show" : true }}'.PHP_EOL; + if (!$first) + echo ","; + echo '{"data" : [' . implode(",", array_values($row)) . '], "label": "' . $agency->value->name . '", "lines" : { "show" : true }, "points" : { "show" : true }}' . PHP_EOL; $first = false; } } } if ($format == "json") { - echo '] - }'.PHP_EOL; - + echo '] + }' . PHP_EOL; } } catch (SetteeRestClientException $e) { setteErrorHandler($e); --- a/admin/genericAgencyFixer.php +++ b/admin/genericAgencyFixer.php @@ -7,28 +7,48 @@ $db = $server->get_db('disclosr-agencies'); +// metatags +try { + $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows; + //print_r($rows); + foreach ($agencies as $agency) { + if (isset($agency->value->scrapeDepth)) { + unset($agency->value->scrapeDepth); + } + if (isset($agency->value->lastScraped)) { + unset($agency->value->lastScraped); + } + $db->save($agency->value); + echo "
"; + flush(); + } +} catch (SetteeRestClientException $e) { + setteErrorHandler($e); +} +// metatags try { $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows; //print_r($rows); foreach ($agencies as $agency) { //echo $agency->value->name . " ".$agency->value->website."
\n"; - // print_r($agency); + // print_r($agency); //hasRestricitiveLicence" hasRestrictiveLicense -> has Restrictive Licence // "hasYoutube" -> Tube // "comment" -> "comments" if (!isset($agency->value->metaTags) && isset($agency->value->website)) { - echo $agency->value->name . " ".$agency->value->website."
\n"; + echo $agency->value->name . " " . $agency->value->website . "
\n"; $agency->value->metaTags = Array(); $request = Requests::get($agency->value->website); $html = phpQuery::newDocumentHTML($request->body); phpQuery::selectDocument($html); foreach (pq('meta')->elements as $meta) { - $tagName = $meta->getAttribute('name');; + $tagName = $meta->getAttribute('name'); + ; $content = $meta->getAttribute('content'); if ($tagName != "") { -echo "$tagName == $content
\n"; - $agency->value->metaTags[$tagName] = $content; + echo "$tagName == $content
\n"; + $agency->value->metaTags[$tagName] = $content; } } //print_r($agency->value->metaTags); --- a/admin/importAPSCEmployees.php +++ b/admin/importAPSCEmployees.php @@ -47,13 +47,17 @@ $changed = false; if (!isset($doc->statistics)) { $changed = true; - $doc->statistics = Array(); + $doc->statistics = new stdClass(); + } + if (!isset($doc->statistics->employees)) { + $changed = true; + $doc->statistics->employees = new stdClass(); } foreach ($sum as $timePeriod => $value) { if (!isset($doc->statistics->employees->$timePeriod->value) || $doc->statistics->employees->$timePeriod->value != $value) { $changed = true; - $doc->statistics["employees"][$timePeriod] = Array("value" => $value, "source" => "http://apsc.gov.au/stateoftheservice/"); + $doc->statistics->employees->$timePeriod = Array("value" => $value, "source" => "http://apsc.gov.au/stateoftheservice/"); } } if ($changed) { --- /dev/null +++ b/admin/importAPSCEmployees2012.php @@ -1,1 +1,86 @@ +get_db('disclosr-agencies'); +$rows = $db->get_view("app", "byName")->rows; +$nametoid = Array(); +$sums = Array(); +$functions = Array(); +foreach ($rows as $row) { + $nametoid[trim($row->key)] = $row->value; +} + + +$request = Requests::get("http://www.apsc.gov.au/publications-and-media/parliamentary/state-of-the-service/new-sosr/appendix-2-aps-agencies"); +$doc = phpQuery::newDocumentHTML($request->body); +phpQuery::selectDocument($doc); +foreach (pq('tr')->elements as $tr) { + //echo $tr->nodeValue.PHP_EOL; + $agency = ""; + $employees = ""; + $function = ""; + $i = 0; + foreach ($tr->childNodes as $td) { + //echo $td->nodeValue." $i
"; + if ($i == 0) + $agency = $td->nodeValue; + if ($i == 2) { + $employees = trim(str_replace(",", "", $td->nodeValue)); + } + if ($i == 4) { + $function = $td->nodeValue; + } + $i++; + } + if ($agency != "" && $employees != "" && $function != "") { + $name = trim(str_replace('2','',$agency)); + //echo "$name

" . PHP_EOL; + if (isset($nametoid[$name])) { + $id = $nametoid[$name]; + //echo $id . "
" . PHP_EOL; + @$sums[$id]["2011-2012"] += $employees; + $functions[$id] = $function; + } else if ($agency != "Agency"){ + echo "
ERROR NAME '$agency' MISSING FROM ID LIST

" . PHP_EOL; + + die(); + } + } else { + echo "skipped $agency"; + } +} +//print_r($sums); +foreach ($sums as $id => $sum) { + echo $id . "
" . PHP_EOL; + $doc = $db->get($id); + echo $doc->name . "
" . PHP_EOL; + // print_r($doc); + $changed = false; + if (!isset($doc->statistics)) { + $changed = true; + $doc->statistics = new stdClass(); + } + if (!isset($doc->statistics->employees)) { + $changed = true; + $doc->statistics->employees = new stdClass(); + } + foreach ($sum as $timePeriod => $value) { + if (!isset($doc->statistics->employees->$timePeriod->value) + || $doc->statistics->employees->$timePeriod->value != $value) { + $changed = true; + $doc->statistics->employees->$timePeriod = Array("value" => $value, "source" => "http://apsc.gov.au/stateoftheservice/"); + $doc->employees = $value; + $doc->functionClassification = $functions[$id]; + } + } + + if ($changed) { + $db->save($doc); + } else { + echo "not changed" . "
" . PHP_EOL; + } +} +// employees: timeperiod, source = apsc state of service, value +?> + --- a/admin/importRTKbodies.php +++ b/admin/importRTKbodies.php @@ -29,6 +29,7 @@ } else { echo $Row[array_search($nameField, $headers)] . PHP_EOL; $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/'.$Row[array_search($accountField, $headers)]; + $accounts[$nametoid[trim($agencyName)]]["rtkDescriptions"][$agencyName] = $Row[array_search("Notes", $headers)]; } } else { echo "error finding any agency" . $line . PHP_EOL; @@ -38,19 +39,26 @@ } extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv","Agency","URL name"); -print_r($accounts); -/* foreach ($accounts as $id => $accountTypes) { +//print_r($accounts); + foreach ($accounts as $id => $allvalues) { echo $id . "
" . PHP_EOL; $doc = object_to_array($db->get($id)); // print_r($doc); - foreach ($accountTypes as $accountType => $accounts) { - if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) { - $doc["has" . $accountType] = Array(); + foreach ($allvalues as $valueType => $values) { + if (!isset($doc[ $valueType]) || !is_array($doc[ $valueType])) { + $doc[ $valueType] = Array(); } - $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts)); + $doc[ $valueType] = array_unique(array_merge($doc[ $valueType], $values)); + if ( $valueType == "rtkDescriptions") { + foreach ($values as $descriptionAgency => $descriptionValue) { + if ($descriptionAgency == $doc->value->name) { + $doc->value->description = $descriptionValue; + } + } + } } $db->save($doc); -}*/ +} ?> --- a/admin/refreshDesignDoc.php +++ b/admin/refreshDesignDoc.php @@ -4,74 +4,61 @@ //function createFOIDocumentsDesignDoc() { $foidb = $server->get_db('disclosr-foidocuments'); - $obj = new stdClass(); - $obj->_id = "_design/" . urlencode("app"); - $obj->language = "javascript"; - $obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; - $obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };"; - $obj->views->byDate->reduce = "_count"; - $obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };"; - $obj->views->byAgencyID->reduce = "_count"; +$obj = new stdClass(); +$obj->_id = "_design/" . urlencode("app"); +$obj->language = "javascript"; +$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; +$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };"; +$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };"; +$obj->views->byDateMonthYear->reduce = "_count"; +$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };"; +$obj->views->byAgencyID->reduce = "_count"; - // allow safe updates (even if slightly slower due to extra: rev-detection check). 
- $foidb->save($obj, true); +// allow safe updates (even if slightly slower due to extra: rev-detection check). +$foidb->save($obj, true); -function createDocumentsDesignDoc() { - /* - global $db; - $obj = new stdClass(); - $obj->_id = "_design/" . urlencode("app"); - $obj->language = "javascript"; - $obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; - $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; - "views": { - "web_server": { - "map": "function(doc) {\n emit(doc.web_server, 1);\n}", - "reduce": "function (key, values, rereduce) {\n return sum(values);\n}" - }, - "byAgency": { - "map": "function(doc) {\n emit(doc.agencyID, 1);\n}", - "reduce": "function (key, values, rereduce) {\n return sum(values);\n}" - }, - "byURL": { - "map": "function(doc) {\n emit(doc.url, doc);\n}" - }, - "agency": { - "map": "function(doc) {\n emit(doc.agencyID, doc);\n}" - }, - "byWebServer": { - "map": "function(doc) {\n emit(doc.web_server, doc);\n}" - }, - "getValidationRequired": { - "map": "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}" - } - } */ -} +//function createDocumentsDesignDoc() { +$docdb = $server->get_db('disclosr-documents'); + +$obj = new stdClass(); +$obj->_id = "_design/" . 
urlencode("app"); +$obj->language = "javascript"; +$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}"; +$obj->views->web_server->reduce = "function (key, values, rereduce) {\n return sum(values);\n}"; +$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}"; +$obj->views->byAgency->reduce = "function (key, values, rereduce) {\n return sum(values);\n}"; +$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}"; +$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}"; +$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}"; +$obj->views->getValidationRequired = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"; + + + //function createAgencyDesignDoc() { $db = $server->get_db('disclosr-agencies'); - $obj = new stdClass(); - $obj->_id = "_design/" . urlencode("app"); - $obj->language = "javascript"; - $obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; - $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; - $obj->views->byCanonicalName->map = "function(doc) { +$obj = new stdClass(); +$obj->_id = "_design/" . 
urlencode("app"); +$obj->language = "javascript"; +$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; +$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; +$obj->views->byCanonicalName->map = "function(doc) { if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') { emit(doc.name, doc); } };"; - $obj->views->byDeptStateName->map = "function(doc) { +$obj->views->byDeptStateName->map = "function(doc) { if (doc.orgType == 'FMA-DepartmentOfState') { emit(doc.name, doc._id); } };"; - $obj->views->parentOrgs->map = "function(doc) { +$obj->views->parentOrgs->map = "function(doc) { if (doc.parentOrg) { emit(doc._id, doc.parentOrg); } };"; - $obj->views->byName->map = 'function(doc) { +$obj->views->byName->map = 'function(doc) { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { emit(doc.name, doc._id); if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) { @@ -95,14 +82,14 @@ } };'; - $obj->views->foiEmails->map = "function(doc) { +$obj->views->foiEmails->map = "function(doc) { emit(doc._id, doc.foiEmail); };"; - $obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; - $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; - $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; - $obj->views->getScrapeRequired->map = "function(doc) { +$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; +$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; +$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; +$obj->views->getScrapeRequired->map = "function(doc) { var lastScrape = Date.parse(doc.metadata.lastScraped); @@ -113,14 +100,14 @@ } };"; - $obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: 
doc.name, abn: doc.abn}); };"; - $obj->views->getConflicts->map = "function(doc) { +$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };"; +$obj->views->getConflicts->map = "function(doc) { if (doc._conflicts) { emit(null, [doc._rev].concat(doc._conflicts)); } }"; - // http://stackoverflow.com/questions/646628/javascript-startswith - $obj->views->score->map = 'if(!String.prototype.startsWith){ +// http://stackoverflow.com/questions/646628/javascript-startswith +$obj->views->score->map = 'if(!String.prototype.startsWith){ String.prototype.startsWith = function (str) { return !this.indexOf(str); } @@ -144,7 +131,7 @@ emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio}); } }'; - $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ +$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ String.prototype.startsWith = function (str) { return !this.indexOf(str); } @@ -164,22 +151,20 @@ emit("total", 1); } }'; - $obj->views->scoreHas->reduce = 'function (key, values, rereduce) { +$obj->views->scoreHas->reduce = 'function (key, values, rereduce) { return sum(values); }'; - $obj->views->fieldNames->map = ' +$obj->views->fieldNames->map = ' function(doc) { for(var propName in doc) { emit(propName, doc._id); } }'; - $obj->views->fieldNames->reduce = 'function (key, values, rereduce) { +$obj->views->fieldNames->reduce = 'function (key, values, rereduce) { return values.length; }'; - // allow safe updates (even if slightly slower due to extra: rev-detection check). - $db->save($obj, true); - - +// allow safe updates (even if slightly slower due to extra: rev-detection check). 
+$db->save($obj, true); ?> --- a/couchdb/SetteeDatabase.class.php +++ /dev/null @@ -1,306 +1,1 @@ -conn_url = $conn_url; - $this->dbname = $dbname; - $this->rest_client = SetteeRestClient::get_instance($this->conn_url); - } - - - /** - * Get UUID from CouchDB - * - * @return - * CouchDB-generated UUID string - * - */ - function gen_uuid() { - $ret = $this->rest_client->http_get('_uuids'); - return $ret['decoded']->uuids[0]; // should never be empty at this point, so no checking - } - - /** - * Create or update a document database - * - * @param $document - * PHP object, a PHP associative array, or a JSON String representing the document to be saved. PHP Objects and arrays are JSON-encoded automatically. - * - *

If $document has a an "_id" property set, it will be used as document's unique id (even for "create" operation). - * If "_id" is missing, CouchDB will be used to generate a UUID. - * - *

If $document has a "_rev" property (revision), document will be updated, rather than creating a new document. - * You have to provide "_rev" if you want to update an existing document, otherwise operation will be assumed to be - * one of creation and you will get a duplicate document exception from CouchDB. Also, you may not provide "_rev" but - * not provide "_id" since that is an invalid input. - * - * @param $allowRevAutoDetection - * Default: false. When true and _rev is missing from the document, save() function will auto-detect latest revision - * for a document and use it. This option is "false" by default because it involves an extra http HEAD request and - * therefore can make save() operation slightly slower if such auto-detection is not required. - * - * @return - * document object with the database id (uuid) and revision attached; - * - * @throws SetteeCreateDatabaseException - */ - function save($document, $allowRevAutoDetection = false) { - if (is_string($document)) { - $document = json_decode($document); - } - - // Allow passing of $document as an array (for syntactic simplicity and also because in JSON world it does not matter) - if(is_array($document)) { - $document = (object) $document; - } - - if (empty($document->_id) && empty($document->_rev)) { - $id = $this->gen_uuid(); - } - elseif (empty($document->_id) && !empty($document->_rev)) { - throw new SetteeWrongInputException("Error: You can not save a document with a revision provided, but missing id"); - } - else { - $id = $document->_id; - - if ($allowRevAutoDetection) { - try { - $rev = $this->get_rev($id); - } catch (SetteeRestClientException $e) { - // auto-detection may fail legitimately, if a document has never been saved before (new doc), so skipping error - } - if (!empty($rev)) { - $document->_rev = $rev; - } - } - } - - $full_uri = $this->dbname . "/" . 
$this->safe_urlencode($id); - $document_json = json_encode($document, JSON_NUMERIC_CHECK); - - $ret = $this->rest_client->http_put($full_uri, $document_json); - - $document->_id = $ret['decoded']->id; - $document->_rev = $ret['decoded']->rev; - - return $document; - } - - /** - * @param $doc - * @param $name - * @param $content - * Content of the attachment in a string-buffer format. This function will automatically base64-encode content for - * you, so you don't have to do it. - * @param $mime_type - * Optional. Will be auto-detected if not provided - * @return void - */ - public function add_attachment($doc, $name, $content, $mime_type = null) { - if (empty($doc->_attachments) || !is_object($doc->_attachments)) { - $doc->_attachments = new stdClass(); - } - - if (empty($mime_type)) { - $mime_type = $this->rest_client->content_mime_type($content); - } - - $doc->_attachments->$name = new stdClass(); - $doc->_attachments->$name->content_type = $mime_type; - $doc->_attachments->$name->data = base64_encode($content); - } - - /** - * @param $doc - * @param $name - * @param $file - * Full path to a file (e.g. as returned by PHP's realpath function). - * @param $mime_type - * Optional. Will be auto-detected if not provided - * @return void - */ - public function add_attachment_file($doc, $name, $file, $mime_type = null) { - $content = file_get_contents($file); - $this->add_attachment($doc, $name, $content, $mime_type); - } - - /** - * - * Retrieve a document from CouchDB - * - * @throws SetteeWrongInputException - * - * @param $id - * Unique ID (usually: UUID) of the document to be retrieved. - * @return - * database document in PHP object format. - */ - function get($id) { - if (empty($id)) { - throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid."); - } - - $full_uri = $this->dbname . "/" . 
$this->safe_urlencode($id); -$full_uri = str_replace("%3Frev%3D","?rev=",$full_uri); - $ret = $this->rest_client->http_get($full_uri); - return $ret['decoded']; - } - - /** - * - * Get the latest revision of a document with document id: $id in CouchDB. - * - * @throws SetteeWrongInputException - * - * @param $id - * Unique ID (usually: UUID) of the document to be retrieved. - * @return - * database document in PHP object format. - */ - function get_rev($id) { - if (empty($id)) { - throw new SetteeWrongInputException("Error: Can't query a document without a uuid."); - } - - $full_uri = $this->dbname . "/" . $this->safe_urlencode($id); - $headers = $this->rest_client->http_head($full_uri); - if (empty($headers['Etag'])) { - throw new SetteeRestClientException("Error: could not retrieve revision. Server unexpectedly returned empty Etag"); - } - $etag = str_replace('"', '', $headers['Etag']); - return $etag; - } - - /** - * Delete a document - * - * @param $document - * a PHP object or JSON representation of the document that has _id and _rev fields. - * - * @return void - */ - function delete($document) { - if (!is_object($document)) { - $document = json_decode($document); - } - - $full_uri = $this->dbname . "/" . $this->safe_urlencode($document->_id) . "?rev=" . $document->_rev; - $this->rest_client->http_delete($full_uri); - } - - - /*----------------- View-related functions --------------*/ - - /** - * Create a new view or update an existing one. - * - * @param $design_doc - * @param $view_name - * @param $map_src - * Source code of the map function in Javascript - * @param $reduce_src - * Source code of the reduce function in Javascript (optional) - * @return void - */ - function save_view($design_doc, $view_name, $map_src, $reduce_src = null) { - $obj = new stdClass(); - $obj->_id = "_design/" . 
urlencode($design_doc); - $view_name = urlencode($view_name); - $obj->views->$view_name->map = $map_src; - if (!empty($reduce_src)) { - $obj->views->$view_name->reduce = $reduce_src; - } - - // allow safe updates (even if slightly slower due to extra: rev-detection check). - return $this->save($obj, true); - } - - /** - * Create a new view or update an existing one. - * - * @param $design_doc - * @param $view_name - * @param $key - * key parameter to a view. Can be a single value or an array (for a range). If passed an array, function assumes - * that first element is startkey, second: endkey. - * @param $descending - * return results in descending order. Please don't forget that if you are using a startkey/endkey, when you change - * order you also need to swap startkey and endkey values! - * - * @return void - */ - function get_view($design_doc, $view_name, $key = null, $descending = false) { - $id = "_design/" . urlencode($design_doc); - $view_name = urlencode($view_name); - $id .= "/_view/$view_name"; - - $data = array(); - if (!empty($key)) { - if (is_string($key)) { - $data = "key=" . '"' . $key . '"'; - } - elseif (is_array($key)) { - list($startkey, $endkey) = $key; - $data = "startkey=" . '"' . $startkey . '"&' . "endkey=" . '"' . $endkey . '"'; - } - - if ($descending) { - $data .= "&descending=true"; - } - } - - - - if (empty($id)) { - throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid."); - } - - $full_uri = $this->dbname . "/" . $this->safe_urlencode($id); -$full_uri = str_replace("%253Fgroup%253Dtrue","?group=true",$full_uri); - $ret = $this->rest_client->http_get($full_uri, $data); - return $ret['decoded']; - - } - - /** - * @param $id - * @return - * return a properly url-encoded id. - */ - private function safe_urlencode($id) { - //-- System views like _design can have "/" in their URLs. 
- $id = rawurlencode($id); - if (substr($id, 0, 1) == '_') { - $id = str_replace('%2F', '/', $id); - } - return $id; - } - - /** Getter for a database name */ - function get_name() { - return $this->dbname; - } - -} --- a/couchdb/settee +++ /dev/null --- /dev/null +++ b/couchdb/settee/.travis.yml @@ -1,1 +1,6 @@ +language: php +phps: + - 5.3 + - 5.4 +before_script: cd tests/ --- /dev/null +++ b/couchdb/settee/LICENSE.txt @@ -1,1 +1,9 @@ +(The MIT License) +Copyright (c) 2011 Irakli Nadareishvili + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --- /dev/null +++ b/couchdb/settee/README.textile @@ -1,1 +1,60 @@ +Inspired by: "CouchRest library for Ruby":http://jchrisa.net/drl/_design/sofa/_list/post/post-page?startkey=%5B%22couchrest__restful_ruby_client_%22%5D and the "couchdb-python":http://packages.python.org/CouchDB/client.html#document library. +h3. 
Server Functions + +# Specify a server: +@$server = new SetteeServer('http://127.0.0.1:5984');@ +# Database API +## Create a database: +@$ret = $server->create_db('irakli_test');@ +## Drop a database: +@$ret = $server->drop_db('irakli_test');@ +## List all databases: +@$ret = $server->list_dbs();@ +## Get a database object +@$db = $server->get_db('irakli_test');@ +# Document API +## Create/Update a document: +@$ret = $db->save($doc);@ +## Retrieve a document: +@$db_doc = $db->get($id);@ +## Determine the latest revision_id for a document: +@$rev = $db->get_rev($id);@ +## Delete a document: +@$db_doc = $db->delete($doc);@ +# Attachments API +## Add content as attachment: +@$db->add_attachment($doc, "foo.txt", "Some text that will be base64 encoded", "text/plain");@ +## Add a file path to be attached: +@$db->add_attachment_file($doc, "foo.pdf", $file_path, "application/pdf");@ +## Add a file path to be attached (mime-type is auto-detected): +@$db->add_attachment_file($doc, "foo.pdf", $file_path);@ +## Full attachment saving example: + $doc = new stdClass(); + $doc->_id = "attachment_doc"; + $file_path = dirname(__FILE__) . "/resources/couch-logo.pdf"; + $this->db->add_attachment_file($doc, "foo.pdf", $file_path, "application/pdf"); + $db_doc = $this->db->save($doc); +## ATTENTION: there is no "load_attachments" method, because when you load a document, all its attachments get loaded with it, as well. 
+# Views API +## Create a new view or save a view: +@$view = $db->save_view("some_design_document_id", "a_view_name", $map_src);@ +@$view = $db->save_view("some_design_document_id", "a_view_name", $map_src, $reduce_src);@ +## Get a view (run query and get results): +@$view = $db->get_view("some_design_document_id", "a_view_name");@ +## Parametrized view: +@$view = $db->get_view("some_design_document_id", "a_view_name", "2009/02/17 21:13:39");@ +## Parametrized view with key range: +@$view = $db->get_view("some_design_document_id", "a_view_name", array("2009/01/30 18:04:11", "2009/02/17 21:13:39"));@ +## Parametrized view with key range, ordered descending: +@$view = $db->get_view("some_design_document_id", "a_view_name", array("2009/01/30 18:04:11", "2009/02/17 21:13:39"), true);@ + + +h3. Requirements +# PHP 5.2 or newer + +h3. Recommended +# PHP 5.3 or newer. With PHP 5.2 following functionality will not work: +## Some unit-tests +## Mime type auto-detection. +# pecl_http --- /dev/null +++ b/couchdb/settee/examples/db.ops.php @@ -1,1 +1,50 @@ +#!/usr/bin/env php + "settee_test_perf_01", + 2 => "settee_test_perf_02", + 3 => "settee_test_perf_03", +); + +print ("creating databases: \n"); + +foreach ($dbs as $db) { + $start = microtime(true); + try { + $ret = $server->create_db($db); + } catch (Exception $e) { + //-- re-throw. this is just for demo + throw $e; + } + $elapsed = microtime(true) - $start; + print("Time elapsed: $elapsed \n"); +} + +$ret = $server->list_dbs(); +print_r($ret); +print ("\n"); + +print ("dropping databases: \n"); + +foreach ($dbs as $db) { + $start = microtime(true); + try { + $ret = $server->drop_db($db); + } catch (Exception $e) { + //-- re-throw. 
this is just for demo + throw $e; + } + $elapsed = microtime(true) - $start; + print("Time elapsed: $elapsed \n"); +} + +$ret = $server->list_dbs(); +print_r($ret); + --- /dev/null +++ b/couchdb/settee/examples/doc.ops.php @@ -1,1 +1,40 @@ +#!/usr/bin/env php +get_db('irakli'); + +try { + $server->create_db($db); +} catch (Exception $e) { + print_r("database irakli already exists! \n"); +} + +$doc = new StdClass(); +$doc->firstName = "Irakli"; +$doc->lastName = "Nadareishvili"; +$doc->IQ = 200; +$doc->hobbies = array("skiing", "swimming"); +$doc->pets = array ("whitey" => "labrador", "mikey" => "pug"); + +// Should work with json string as well: +//$doc = '{"firstName":"irakli","lastName":"Nadareishvili","IQ":200,"hobbies":["skiing","swimming"],"pets":{"whitey":"labrador","mikey":"pug"}}'; + +$doc = $db->save($doc); +print_r($doc); + +$doc = $db->get($doc->_id); +print_r($doc); + +$doc->firstName = "Ika"; +$doc = $db->save($doc); +print_r($doc); + +$db->delete($doc); + + + --- /dev/null +++ b/couchdb/settee/src/classes/SetteeDatabase.class.php @@ -1,1 +1,316 @@ - +conn_url = $conn_url; + $this->dbname = $dbname; + $this->rest_client = SetteeRestClient::get_instance($this->conn_url); + } + + /** + * Get UUID from CouchDB + * + * @return + * CouchDB-generated UUID string + * + */ + function gen_uuid() { + $ret = $this->rest_client->http_get('_uuids'); + return $ret['decoded']->uuids[0]; // should never be empty at this point, so no checking + } + + /** + * Create or update a document database + * + * @param $document + * PHP object, a PHP associative array, or a JSON String representing the document to be saved. PHP Objects and arrays are JSON-encoded automatically. + * + *

If $document has an "_id" property set, it will be used as the document's unique id (even for "create" operation). + * If "_id" is missing, CouchDB will be used to generate a UUID. + * + *

If $document has a "_rev" property (revision), document will be updated, rather than creating a new document. + * You have to provide "_rev" if you want to update an existing document, otherwise operation will be assumed to be + * one of creation and you will get a duplicate document exception from CouchDB. Also, you may not provide "_rev" but + * not provide "_id" since that is an invalid input. + * + * @param $allowRevAutoDetection + * Default: false. When true and _rev is missing from the document, save() function will auto-detect latest revision + * for a document and use it. This option is "false" by default because it involves an extra http HEAD request and + * therefore can make save() operation slightly slower if such auto-detection is not required. + * + * @return + * document object with the database id (uuid) and revision attached; + * + * @throws SetteeCreateDatabaseException + */ + function save($document, $allowRevAutoDetection = false) { + if (is_string($document)) { + $document = json_decode($document); + } + + // Allow passing of $document as an array (for syntactic simplicity and also because in JSON world it does not matter) + if (is_array($document)) { + $document = (object) $document; + } + + if (empty($document->_id) && empty($document->_rev)) { + $id = $this->gen_uuid(); + } elseif (empty($document->_id) && !empty($document->_rev)) { + throw new SetteeWrongInputException("Error: You can not save a document with a revision provided, but missing id"); + } else { + $id = $document->_id; + + if ($allowRevAutoDetection) { + try { + $rev = $this->get_rev($id); + } catch (SetteeRestClientException $e) { + // auto-detection may fail legitimately, if a document has never been saved before (new doc), so skipping error + } + if (!empty($rev)) { + $document->_rev = $rev; + } + } + } + + $full_uri = $this->dbname . "/" . 
$this->safe_urlencode($id); + $document_json = json_encode($document, JSON_NUMERIC_CHECK); + + $ret = $this->rest_client->http_put($full_uri, $document_json); + + $document->_id = $ret['decoded']->id; + $document->_rev = $ret['decoded']->rev; + + return $document; + } + + /** + * @param $doc + * @param $name + * @param $content + * Content of the attachment in a string-buffer format. This function will automatically base64-encode content for + * you, so you don't have to do it. + * @param $mime_type + * Optional. Will be auto-detected if not provided + * @return void + */ + public function add_attachment($doc, $name, $content, $mime_type = null) { + if (empty($doc->_attachments) || !is_object($doc->_attachments)) { + $doc->_attachments = new stdClass(); + } + + if (empty($mime_type)) { + $mime_type = $this->rest_client->content_mime_type($content); + } + + $doc->_attachments->$name = new stdClass(); + $doc->_attachments->$name->content_type = $mime_type; + $doc->_attachments->$name->data = base64_encode($content); + } + + /** + * @param $doc + * @param $name + * @param $file + * Full path to a file (e.g. as returned by PHP's realpath function). + * @param $mime_type + * Optional. Will be auto-detected if not provided + * @return void + */ + public function add_attachment_file($doc, $name, $file, $mime_type = null) { + $content = file_get_contents($file); + $this->add_attachment($doc, $name, $content, $mime_type); + } + + /** + * + * Retrieve a document from CouchDB + * + * @throws SetteeWrongInputException + * + * @param $id + * Unique ID (usually: UUID) of the document to be retrieved. + * @return + * database document in PHP object format. + */ + function get($id) { + if (empty($id)) { + throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid."); + } + + $full_uri = $this->dbname . "/" . 
$this->safe_urlencode($id); + $full_uri = str_replace("%3Frev%3D", "?rev=", $full_uri); + $ret = $this->rest_client->http_get($full_uri); + return $ret['decoded']; + } + + /** + * + * Get the latest revision of a document with document id: $id in CouchDB. + * + * @throws SetteeWrongInputException + * + * @param $id + * Unique ID (usually: UUID) of the document to be retrieved. + * @return + * database document in PHP object format. + */ + function get_rev($id) { + if (empty($id)) { + throw new SetteeWrongInputException("Error: Can't query a document without a uuid."); + } + + $full_uri = $this->dbname . "/" . $this->safe_urlencode($id); + $headers = $this->rest_client->http_head($full_uri); + if (empty($headers['Etag'])) { + throw new SetteeRestClientException("Error: could not retrieve revision. Server unexpectedly returned empty Etag"); + } + $etag = str_replace('"', '', $headers['Etag']); + return $etag; + } + + /** + * Delete a document + * + * @param $document + * a PHP object or JSON representation of the document that has _id and _rev fields. + * + * @return void + */ + function delete($document) { + if (!is_object($document)) { + $document = json_decode($document); + } + + $full_uri = $this->dbname . "/" . $this->safe_urlencode($document->_id) . "?rev=" . $document->_rev; + $this->rest_client->http_delete($full_uri); + } + + /* ----------------- View-related functions -------------- */ + + /** + * Create a new view or update an existing one. + * + * @param $design_doc + * @param $view_name + * @param $map_src + * Source code of the map function in Javascript + * @param $reduce_src + * Source code of the reduce function in Javascript (optional) + * @return void + */ + function save_view($design_doc, $view_name, $map_src, $reduce_src = null) { + $obj = new stdClass(); + $obj->_id = "_design/" . 
urlencode($design_doc); + $view_name = urlencode($view_name); + $obj->views->$view_name->map = $map_src; + if (!empty($reduce_src)) { + $obj->views->$view_name->reduce = $reduce_src; + } + + // allow safe updates (even if slightly slower due to extra: rev-detection check). + return $this->save($obj, true); + } + + /** + * Create a new view or update an existing one. + * + * @param $design_doc + * @param $view_name + * @param $key + * key parameter to a view. Can be a single value or an array (for a range). If passed an array, function assumes + * that first element is startkey, second: endkey. + * @param $descending + * return results in descending order. Please don't forget that if you are using a startkey/endkey, when you change + * order you also need to swap startkey and endkey values! + * + * @return void + */ + function get_view($design_doc, $view_name, $key = null, $descending = false, $limit = false, $reduce = null, $startdocid = null) { + $id = "_design/" . urlencode($design_doc); + $view_name = urlencode($view_name); + $id .= "/_view/$view_name"; + + $data = array(); + if (!empty($key)) { + if (is_string($key)) { + $data = "key=" . '"' . $key . '"'; + } elseif (is_array($key)) { + list($startkey, $endkey) = $key; + $data = "startkey=" . '"' . $startkey . '"&' . "endkey=" . '"' . $endkey . '"'; + } + + if ($descending) { + $data .= "&descending=true"; + } + if ($startdocid != null) { + $data .= "&startkey_docid='$startdocid'"; + } + if ($reduce === true) { + $data .= "&reduce=true"; + } else if ($reduce === false){ + + $data .= "&reduce=false"; + } + if ($limit) { + $data .= "&limit=" . $limit; + } + } + + + + if (empty($id)) { + throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid."); + } + + $full_uri = $this->dbname . "/" . 
$this->safe_urlencode($id); + + $full_uri = str_replace("%253Fgroup%253D", "?group=", $full_uri); + $full_uri = str_replace("%253Flimit%253D", "?limit=", $full_uri); + $ret = $this->rest_client->http_get($full_uri, $data); + //$ret['decoded'] = str_replace("?k","&k",$ret['decoded']); + return $ret['decoded']; + } + + /** + * @param $id + * @return + * return a properly url-encoded id. + */ + private function safe_urlencode($id) { + //-- System views like _design can have "/" in their URLs. + $id = rawurlencode($id); + if (substr($id, 0, 1) == '_') { + $id = str_replace('%2F', '/', $id); + } + return $id; + } + + /** Getter for a database name */ + function get_name() { + return $this->dbname; + } + +} + --- /dev/null +++ b/couchdb/settee/src/classes/SetteeRestClient.class.php @@ -1,1 +1,247 @@ - +base_url = $base_url; + + $curl = curl_init(); + curl_setopt($curl, CURLOPT_USERAGENT, "Settee CouchDB Client/1.0"); + curl_setopt($curl, CURLOPT_HTTPHEADER, array('Content-Type: application/json')); + curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); + curl_setopt($curl, CURLOPT_HEADER, 0); + curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1); + curl_setopt($curl, CURLOPT_TIMEOUT_MS, self::HTTP_TIMEOUT); + curl_setopt($curl, CURLOPT_FORBID_REUSE, false); // Connection-pool for CURL + + $this->curl = $curl; + + } + + /** + * Class destructor cleans up any resources + */ + function __destruct() { + curl_close($this->curl); + } + + /** + * HTTP HEAD + * + * @return + * Raw HTTP Headers of the response. 
+ * + * @see: http://www.php.net/manual/en/context.params.php + * + */ + function http_head($uri) { + curl_setopt($this->curl, CURLOPT_HEADER, 1); + + $full_url = $this->get_full_url($uri); + curl_setopt($this->curl, CURLOPT_URL, $full_url); + curl_setopt($this->curl, CURLOPT_CUSTOMREQUEST, 'HEAD'); + curl_setopt($this->curl, CURLOPT_NOBODY, true); + + + $response = curl_exec($this->curl); + // Restore default values + curl_setopt($this->curl, CURLOPT_NOBODY, false); + curl_setopt($this->curl, CURLOPT_HEADER, false); + + $resp_code = curl_getinfo($this->curl, CURLINFO_HTTP_CODE); + if ($resp_code == 404 ) { + throw new SetteeRestClientException("Couch document not found at: '$full_url'"); + } + + if (function_exists('http_parse_headers')) { + $headers = http_parse_headers($response); + } + else { + $headers = $this->_http_parse_headers($response); + } + + return $headers; + } + + /** + * Backup PHP impl. for when PECL http_parse_headers() function is not available + * + * @param $header + * @return array + * @source http://www.php.net/manual/en/function.http-parse-headers.php#77241 + */ + private function _http_parse_headers( $header ) { + $retVal = array(); + $fields = explode("\r\n", preg_replace('/\x0D\x0A[\x09\x20]+/', ' ', $header)); + foreach( $fields as $field ) { + if( preg_match('/([^:]+): (.+)/m', $field, $match) ) { + $match[1] = preg_replace('/(?<=^|[\x09\x20\x2D])./e', 'strtoupper("\0")', strtolower(trim($match[1]))); + if( isset($retVal[$match[1]]) ) { + $retVal[$match[1]] = array($retVal[$match[1]], $match[2]); + } else { + $retVal[$match[1]] = trim($match[2]); + } + } + } + return $retVal; + } + + /** + * HTTP GET + */ + function http_get($uri, $data = array()) { + $data = (is_array($data)) ? 
http_build_query($data) : $data; + if (!empty($data)) { + $uri .= "?$data"; + } + return $this->http_request('GET', $uri); + } + + /** + * HTTP PUT + */ + function http_put($uri, $data = array()) { + return $this->http_request('PUT', $uri, $data); + } + + /** + * HTTP DELETE + */ + function http_delete($uri, $data = array()) { + return $this->http_request('DELETE', $uri, $data); + } + + /** + * Generic implementation of a HTTP Request. + * + * @param $http_method + * @param $uri + * @param array $data + * @return + * an array containing json and decoded versions of the response. + */ + private function http_request($http_method, $uri, $data = array()) { + $data = (is_array($data)) ? http_build_query($data) : $data; + + if (!empty($data)) { + curl_setopt($this->curl, CURLOPT_HTTPHEADER, array('Content-Length: ' . strlen($data))); + curl_setopt($this->curl, CURLOPT_POSTFIELDS, $data); + } + + curl_setopt($this->curl, CURLOPT_URL, $this->get_full_url($uri)); + curl_setopt($this->curl, CURLOPT_CUSTOMREQUEST, $http_method); + + $response = curl_exec($this->curl); + $response_decoded = $this->decode_response($response); + $response = array('json' => $response, 'decoded'=>$response_decoded); + + $this->check_status($response,$uri); + + return $response; + } + + /** + * Check http status for safe return codes + * + * @throws SetteeRestClientException + */ + private function check_status($response,$uri) { + $resp_code = curl_getinfo($this->curl, CURLINFO_HTTP_CODE); + + if ($resp_code < 199 || $resp_code > 399 || !empty($response['decoded']->error)) { + $msg = "CouchDB returned: \"HTTP 1.1. $resp_code\". ERROR: " . $response['json'] . $uri; + throw new SetteeRestClientException($msg); + } + } + + /** + * @param $path + * Full path to a file (e.g. as returned by PHP's realpath function). 
+ * @return void + */ + public function file_mime_type ($path) { + $ftype = 'application/octet-stream'; + + if (function_exists("finfo_file")) { + $finfo = new finfo(FILEINFO_MIME_TYPE | FILEINFO_SYMLINK); + $fres = $finfo->file($path); + if (is_string($fres) && !empty($fres)) { + $ftype = $fres; + } + } + + return $ftype; + } + + /** + * @param $content + * content of a file in a string buffer format. + * @return void + */ + public function content_mime_type ($content) { + $ftype = 'application/octet-stream'; + + if (function_exists("finfo_file")) { + $finfo = new finfo(FILEINFO_MIME_TYPE | FILEINFO_SYMLINK); + $fres = $finfo->buffer($content); + if (is_string($fres) && !empty($fres)) { + $ftype = $fres; + } + } + + return $ftype; + } + + + /** + * + * @param $json + * json-encoded response from CouchDB + * + * @return + * decoded PHP object + */ + private function decode_response($json) { + return json_decode($json); + } + + /** + * Get full URL from a partial one + */ + private function get_full_url($uri) { + // We do not want "/", "?", "&" and "=" separators to be encoded!!! + $uri = str_replace(array('%2F', '%3F', '%3D', '%26'), array('/', '?', '=', '&'), urlencode($uri)); + return $this->base_url . '/' . $uri; + } +} + +class SetteeRestClientException extends Exception {} + --- /dev/null +++ b/couchdb/settee/src/classes/SetteeServer.class.php @@ -1,1 +1,106 @@ +conn_url = rtrim($conn_url, ' /'); + $this->rest_client = SetteeRestClient::get_instance($this->conn_url); + } + + /** + * Create database + * + * @param $db + * Either a database object or a String name of the database. + * + * @return + * json string from the server. + * + * @throws SetteeCreateDatabaseException + */ + function create_db($db) { + if ($db instanceof SetteeDatabase) { + $db = $db->get_name(); + } + $ret = $this->rest_client->http_put($db); + if (!empty($ret['decoded']->error)) { + throw new SetteeDatabaseException("Could not create database: " . 
$ret["json"]); + } + return $ret['decoded']; + } + + /** + * Drop database + * + * @param $db + * Either a database object or a String name of the database. + * + * @return + * json string from the server. + * + * @throws SetteeDropDatabaseException + */ + function drop_db($db) { + if ($db instanceof SetteeDatabase) { + $db = $db->get_name(); + } + $ret = $this->rest_client->http_delete($db); + if (!empty($ret['decoded']->error)) { + throw new SetteeDatabaseException("Could not create database: " . $ret["json"]); + } + return $ret['decoded']; + } + + /** + * Instantiate a database object + * + * @param $dbname + * name of the newly created database + * + * @return SetteeDatabase + * new SetteeDatabase instance. + */ + function get_db($dbname) { + return new SetteeDatabase($this->conn_url, $dbname); + } + + + /** + * Return an array containing all databases + * + * @return Array + * an array of database names in the CouchDB instance + */ + function list_dbs() { + $ret = $this->rest_client->http_get('_all_dbs'); + if (!empty($ret['decoded']["error"])) { + throw new SetteeDatabaseException("Could not get list of databases: " . $ret["json"]); + } + return $ret['decoded']; + } + +} + +class SetteeServerErrorException extends Exception {} +class SetteeDatabaseException extends Exception {} +class SetteeWrongInputException extends Exception {} --- /dev/null +++ b/couchdb/settee/src/settee.php @@ -1,1 +1,6 @@ + sudo upgrade pear + > sudo pear channel-discover pear.phpunit.de + > sudo pear install phpunit/PHPUnit +2. You need PHP 5.3.2 or later to run some tests that deal with private or protected methods. If you use an earlier + version of PHP, these tests will be skipped. + +3. Run all tests with: + > phpunit . 
--- /dev/null +++ b/couchdb/settee/tests/SetteeDatabaseTest.php @@ -1,1 +1,281 @@ - +db = $this->server->get_db($dbname); + $this->server->create_db($this->db); + } + + public function test_document_lifecycle_objectbased() { + $doc = new StdClass(); + $doc->firstName = "Irakli"; + $doc->lastName = "Nadareishvili"; + $doc->IQ = 200; + $doc->hobbies = array("skiing", "swimming"); + $doc->pets = array ("whitey" => "labrador", "mikey" => "pug"); + + $doc = $this->db->save($doc); + $this->assertTrue(!empty($doc->_id) && !empty($doc->_rev), "Document creation success [object-based]"); + + $_rev = $doc->_rev; + $doc = $this->db->get($doc->_id); + $this->assertEquals($_rev, $doc->_rev, "Document retrieval success [object-based] test"); + + $doc->firstName = "Ika"; + $db_doc = $this->db->save($doc); + $this->assertEquals($doc->firstName, $db_doc->firstName, "Document update success [object-based]"); + + $this->db->delete($doc); + + + try { + $doc = $this->db->get($doc->_id); + } catch (SetteeRestClientException $e) { + // we expect exception to fire, so this is good. + return; + } + + $this->fail('Document still available for retrieval after being deleted. [object-based]'); + } + + // Should work with json string as well: + // + + + public function test_document_lifecycle_jsonbased() { + $doc = '{"firstName":"Irakli","lastName":"Nadareishvili","IQ":200,"hobbies":["skiing","swimming"],"pets":{"whitey":"labrador","mikey":"pug"}}'; + + $doc = $this->db->save($doc); + $this->assertTrue(!empty($doc->_id) && !empty($doc->_rev), "Document creation success [json-based]"); + + $_rev = $doc->_rev; + + $db_doc = $this->db->get($doc->_id); + $this->assertEquals($_rev, $db_doc->_rev, "Document retrieval success [json-based] test"); + + $doc = '{'; + $doc .= '"_id":"' . $db_doc->_id . '",'; + $doc .= '"_rev":"' . $db_doc->_rev . 
'",'; + $doc .= '"firstName":"Ika","lastName":"Nadareishvili","IQ":200,"hobbies":["skiing","swimming"],"pets":{"whitey":"labrador","mikey":"pug"}}'; + + $orig_doc = json_decode($doc); + $db_doc = $this->db->save($doc); + $this->assertEquals($orig_doc->firstName, $db_doc->firstName, "Document update success [json-based]"); + + $doc = '{'; + $doc .= '"_id":"' . $db_doc->_id . '",'; + $doc .= '"_rev":"' . $db_doc->_rev . '",'; + $doc .= '"firstName":"Ika","lastName":"Nadareishvili","IQ":200,"hobbies":["skiing","swimming"],"pets":{"whitey":"labrador","mikey":"pug"}}'; + + $this->db->delete($doc); + + try { + $doc = $this->db->get($db_doc->_id); + } catch (SetteeRestClientException $e) { + // we expect exception to fire, so this is good. + return; + } + + $this->fail('Document still available for retrieval after being deleted. [object-based]'); + } + + public function test_invalid_document() { + $doc = 12345; + try { + $doc = $this->db->save($doc); + } catch (SetteeRestClientException $e) { + // we expect exception to fire, so this is good. 
+ return; + } + + $this->fail('Document saved with invalid format'); + } + + public function test_get_rev() { + $doc = new stdClass(); + $doc->_id = "some_fixed_id"; + $doc = $this->db->save($doc); + + $_rev = $doc->_rev; + + $db_rev = $this->db->get_rev($doc->_id); + $this->assertEquals($_rev, $db_rev, "Document Revision retrieval success"); + + // _rev is now attached to this object due to last ->save() call + $doc->_id = "some_fixed_id"; + $doc->title = "Some Fixed ID"; + $doc = $this->db->save($doc); + + $_rev = $doc->_rev; + + $db_rev = $this->db->get_rev($doc->_id); + $this->assertEquals($_rev, $db_rev, "Document Revision retrieval success after re-save"); + + } + + public function test_save_auto_revision_detection() { + $doc = new stdClass(); + $doc->_id = "some_fixed_id"; + $this->db->save($doc); + + $doc = new stdClass(); + $doc->_id = "some_fixed_id"; + $doc->extra_field = "some other value"; + + $new_doc = $this->db->save($doc, true); + $this->assertEquals ($new_doc->extra_field, "some other value", "Testing auto-rev detection by save method"); + } + + public function test_inline_attachment_json() { + $doc = '{ + "_id":"attachment_doc", + "_attachments": + { + "foo.txt": + { + "content_type":"text\/plain", + "data": "VGhpcyBpcyBhIGJhc2U2NCBlbmNvZGVkIHRleHQ=" + } + } + }'; + $db_doc = $this->db->save($doc); + $this->assertTrue(is_object($db_doc->_attachments), "Inline attachment save successful [json-based]"); + } + + public function test_inline_attachment_obj_content() { + $doc = new stdClass(); + $doc->_id = "attachment_doc"; + $this->db->add_attachment($doc, "foo.txt", "This is some text to be encoded", "text/plain"); + $db_doc = $this->db->save($doc); + $this->assertTrue(is_object($db_doc->_attachments), "Inline attachment save successful [object-based]"); + + $doc = new stdClass(); + $doc->_id = "attachment_doc_autodetect"; + $this->db->add_attachment($doc, "foo.txt", "This is some other text to be encoded"); + $db_doc = $this->db->save($doc); + 
$this->assertTrue(is_object($db_doc->_attachments), "Inline attachment save successful [object-based, mime auto-detection]"); + } + + public function test_inline_attachment_obj_file() { + $doc = new stdClass(); + $doc->_id = "attachment_doc"; + $file_path = dirname(__FILE__) . "/resources/couch-logo.pdf"; + $this->db->add_attachment_file($doc, "foo.pdf", $file_path, "application/pdf"); + $db_doc = $this->db->save($doc); + $this->assertTrue(is_object($db_doc->_attachments), "Inline attachment of file successful"); + + $doc = new stdClass(); + $doc->_id = "attachment_doc_autodetect"; + $file_path = dirname(__FILE__) . "/resources/couch-logo.pdf"; + $this->db->add_attachment_file($doc, "foo.pdf", $file_path); + $db_doc = $this->db->save($doc); + $this->assertTrue(is_object($db_doc->_attachments), "Inline attachment of file successful w/ mime type auto-detection"); + } + + public function test_view_lifecycle() { + $this->_create_some_sample_docs(); + + $map_src = <<db->save_view("foo_views", "bar_view", $map_src); + $this->assertEquals("_design/foo_views", $view->_id, "View Creation Success"); + + $view = $this->db->get_view("foo_views", "bar_view"); + $this->assertEquals(3, $view->total_rows, "Running a View Success"); + + $map_src = <<db->save_view("foo_views", "bar_view", $map_src); + $this->assertEquals("_design/foo_views", $view->_id, "View Update Success"); + + $view = $this->db->get_view("foo_views", "bar_view"); + $this->assertEquals("Well hello and welcome to my new blog...", $view->rows[0]->value->body, "Running a View Success (after update)"); + + $view = $this->db->get_view("foo_views", "bar_view", "2009/02/17 21:13:39"); + $this->assertEquals("Bought a Cat", $view->rows[0]->value->title, "Running a Parametrized View"); + + $view = $this->db->get_view("foo_views", "bar_view", array("2009/01/30 18:04:11", "2009/02/17 21:13:39")); + $this->assertEquals("Biking", $view->rows[0]->value->title, "Running a Parametrized View with range"); + + $view = 
$this->db->get_view("foo_views", "bar_view", array("2009/02/17 21:13:39", "2009/01/30 18:04:11"), true); + $this->assertEquals("Bought a Cat", $view->rows[0]->value->title, "Running a Parametrized View with range, descending"); + $this->assertEquals(2, count($view->rows), "Running a Parametrized View with range, descending [count]"); + +} + + function test_two_views_in_a_design_doc() { + + $map_src = <<db->save_view("a_settee_design_doc", "foo_view", $map_src); + $this->assertTrue(isset($view->views->foo_view), "View1 Creation Success"); + + $view = $this->db->save_view("a_settee_design_doc", "bar_view", $map_src); + $this->assertTrue(isset($view->views->bar_view), "View2 Creation Success"); + } + + /** + * Create some sample docs for running tests on them. + * + *

This sample was taken from a wonderful book: + * CouchDB: The Definitive Guide (Animal Guide) by J. Chris Anderson, Jan Lehnardt and Noah Slater + * http://www.amazon.com/CouchDB-Definitive-Guide-Relax-Animal/dp/0596155891/ref=sr_1_1?ie=UTF8&qid=1311533443&sr=8-1 + * + * @return void + */ + private function _create_some_sample_docs() { + $doc = new stdClass(); + $doc->_id = "biking"; + $doc->title = "Biking"; + $doc->body = "My biggest hobby is mountainbiking"; + $doc->date = "2009/01/30 18:04:11"; + $this->db->save($doc); + + $doc = new stdClass(); + $doc->_id = "bought-a-cat"; + $doc->title = "Bought a Cat"; + $doc->body = "I went to the the pet store earlier and brought home a little kitty..."; + $doc->date = "2009/02/17 21:13:39"; + $this->db->save($doc); + + $doc = new stdClass(); + $doc->_id = "hello-world"; + $doc->title = "Hello World"; + $doc->body = "Well hello and welcome to my new blog..."; + $doc->date = "2009/01/15 15:52:20"; + $this->db->save($doc); + } + + public function tearDown() { + $ret = $this->server->drop_db($this->db); + } + +} + + --- /dev/null +++ b/couchdb/settee/tests/SetteeRestClientTest.php @@ -1,1 +1,90 @@ +rest_client = SetteeRestClient::get_instance($this->db_url); + } + + public function test_get_full_url() { + + //-- Can't run this test in PHP versions earlier than 5.3.2, which do not support ReflectionMethod class. + if (!class_exists('ReflectionMethod')) { + return; + } + + //-- Prepare for testing the private full_url_method method. + $get_full_url_method = new ReflectionMethod('SetteeRestClient', 'get_full_url'); + $get_full_url_method->setAccessible(TRUE); + + $uri = 'irakli/26cede9ab9cd8fcd67895eb05200d1ea'; + //-- Equivalent to: $calc = $this->rest_client->get_full_url($uri); but for a private method. + $calc = $get_full_url_method->invokeArgs($this->rest_client, array($uri)); + //-- + $expected = $this->db_url . 
'/irakli/26cede9ab9cd8fcd67895eb05200d1ea'; + $this->assertEquals($expected, $calc, "Full URL Generation with DB and ID"); + + $uri = 'irakli/26cede9ab9cd8fcd67895eb05200d1ea?rev=2-21587f7dffc43b4100f40168f309a267'; + $calc = $get_full_url_method->invokeArgs($this->rest_client, array($uri)); + $expected = $this->db_url . '/irakli/26cede9ab9cd8fcd67895eb05200d1ea?rev=2-21587f7dffc43b4100f40168f309a267'; + $this->assertEquals($expected, $calc, "Full URL Generation with DB, ID and Single Query Parameter"); + + $uri = 'irakli/26cede9ab9cd8fcd67895eb05200d1ea?rev=2-21587f7dffc43b4100f40168f309a267&second=foo'; + $calc = $get_full_url_method->invokeArgs($this->rest_client, array($uri)); + $expected = $this->db_url . '/irakli/26cede9ab9cd8fcd67895eb05200d1ea?rev=2-21587f7dffc43b4100f40168f309a267&second=foo'; + $this->assertEquals($expected, $calc, "Full URL Generation with DB, ID and Two Query Parameters"); + + } + + public function test_file_mime_type() { + + $type = $this->rest_client->file_mime_type(dirname(__FILE__) . "/resources/couch-logo.jpg"); + $this->assertEquals("image/jpeg", $type, "Jpeg Mime Type Detection"); + + $type = $this->rest_client->file_mime_type(dirname(__FILE__) . "/resources/couch-logo.pdf"); + $this->assertEquals("application/pdf", $type, "PDF Mime Type Detection"); + + + $type = $this->rest_client->file_mime_type(dirname(__FILE__) . "/resources/couch-logo.png"); + $this->assertEquals("image/png", $type, "PNG Mime Type Detection"); + + $type = $this->rest_client->file_mime_type(dirname(__FILE__) . "/resources/couch-tag.ini"); + $this->assertEquals("text/plain", $type, "Text Mime Type Detection"); + + $type = $this->rest_client->file_mime_type(dirname(__FILE__) . "/resources/couch-tag.xml"); + $this->assertEquals("application/xml", $type, "XML Mime Type Detection"); + } + + public function test_content_mime_type() { + $content = file_get_contents(dirname(__FILE__) . 
"/resources/couch-logo.jpg"); + $type = $this->rest_client->content_mime_type($content); + $this->assertEquals("image/jpeg", $type, "Jpeg Mime Type Detection"); + + $content = file_get_contents(dirname(__FILE__) . "/resources/couch-logo.pdf"); + $type = $this->rest_client->content_mime_type($content); + $this->assertEquals("application/pdf", $type, "PDF Mime Type Detection"); + + $content = file_get_contents(dirname(__FILE__) . "/resources/couch-logo.png"); + $type = $this->rest_client->content_mime_type($content); + $this->assertEquals("image/png", $type, "PNG Mime Type Detection"); + + $content = file_get_contents(dirname(__FILE__) . "/resources/couch-tag.ini"); + $type = $this->rest_client->content_mime_type($content); + $this->assertEquals("text/plain", $type, "Text Mime Type Detection"); + + $content = file_get_contents(dirname(__FILE__) . "/resources/couch-tag.xml"); + $type = $this->rest_client->content_mime_type($content); + $this->assertEquals("application/xml", $type, "XML Mime Type Detection"); + } + + + +} + + --- /dev/null +++ b/couchdb/settee/tests/SetteeServerTest.php @@ -1,1 +1,43 @@ +dbname = "settee_tests_" . 
md5(microtime(true)); + } + + public function test_database_lifecycle_namebased() { + $db = $this->server->get_db($this->dbname); + $ret = $this->server->create_db($this->dbname); + $this->assertTrue($ret->ok, "Database Creation Success Response [name-based]"); + + $database_list = $this->server->list_dbs(); + $this->assertTrue(is_array($database_list) && in_array($this->dbname, $database_list), + "Verifying Database in the List on the Server [name-based]"); + + $ret = $this->server->drop_db($this->dbname); + $this->assertTrue($ret->ok, "Database Deletion Success Response [name-based]"); + } + + public function test_database_lifecycle_objectbased() { + $db = $this->server->get_db($this->dbname); + $ret = $this->server->create_db($db); + $this->assertTrue($ret->ok, "Database Creation Success Response [object-based]"); + + $database_list = $this->server->list_dbs(); + $this->assertTrue(is_array($database_list) && in_array($this->dbname, $database_list), + "Verifying Database in the List on the Server [object-based]"); + + $ret = $this->server->drop_db($db); + $this->assertTrue($ret->ok, "Database Deletion Success Response [object-based]"); + } + +} + + --- /dev/null +++ b/couchdb/settee/tests/SetteeTestCase.class.php @@ -1,1 +1,20 @@ +db_url = isset($GLOBALS['db_url']) ? $GLOBALS['db_url'] : 'http://127.0.0.1:5984'; + $this->db_user = isset($GLOBALS['db_user']) ? $GLOBALS['db_user'] : 'admin'; + $this->db_pass = isset($GLOBALS['db_pass']) ? $GLOBALS['db_pass'] : 'admin'; + $this->server = new SetteeServer($this->db_url); + } + +} --- /dev/null +++ b/couchdb/settee/tests/phpunitConfig.xml @@ -1,1 +1,8 @@ + + + + + + + --- /dev/null +++ b/couchdb/settee/tests/resources/couch-tag.ini @@ -1,1 +1,2 @@ +Couchdb=relax --- /dev/null +++ b/couchdb/settee/tests/resources/couch-tag.xml @@ -1,1 +1,5 @@ + + +

CouchDB - Relax
+ --- /dev/null +++ b/documents/.gitignore @@ -1,1 +1,2 @@ +*.pyc --- /dev/null +++ b/documents/about.php @@ -1,1 +1,11 @@ + +

About

+ + --- /dev/null +++ b/documents/agency.php @@ -1,1 +1,41 @@ +get_db('disclosr-agencies'); +$idtoname = Array(); +foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) { + $idtoname[$row->id] = trim($row->value->name); +} +$foidocsdb = $server->get_db('disclosr-foidocuments'); + +include_header_documents((isset($_REQUEST['id']) ? $idtoname[$_REQUEST['id']] : 'Entries by Agency')); +$endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99'); +?> +
Read all the information released by Australian Federal Government agencies under the FOI Act in one place!
+RSS Icon All Agencies RSS Feed
+get_view("app", "byAgencyID", $_REQUEST['id'], false, false, false)->rows; + foreach ($rows as $row) { + //print_r($rows); + echo displayLogEntry($row, $idtoname); + if (!isset($startkey)) + $startkey = $row->key; + $endkey = $row->key; + } + } else { + $rows = $foidocsdb->get_view("app", "byAgencyID?group=true", null, false, false, true)->rows; + if ($rows) { + foreach ($rows as $row) { + echo '' . $idtoname[$row->key] . " (" . $row->value . " records)
\n"; + } + } + } +} catch (SetteeRestClientException $e) { + setteErrorHandler($e); +} +echo "next page "; +include_footer_documents(); +?> --- a/documents/charts.php +++ b/documents/charts.php @@ -1,6 +1,6 @@ get_db('disclosr-agencies'); @@ -15,29 +15,28 @@

Charts

Lorem ipsum.

-
+
+
+
Read all the information released by Australian Federal Government agencies under the FOI Act in one place!
+RSS Icon All Agencies RSS Feed
+get_db('disclosr-agencies'); + +$idtoname = Array(); +foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) { + $idtoname[$row->id] = trim($row->value->name); +} +$foidocsdb = $server->get_db('disclosr-foidocuments'); +try { + $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20)->rows; + if ($rows) { + foreach ($rows as $key => $row) { + echo displayLogEntry($row, $idtoname); + if (!isset($startkey)) $startkey = $row->key; + $endkey = $row->key; + } + } +} catch (SetteeRestClientException $e) { + setteErrorHandler($e); +} +echo "next page "; +*/ +include_footer_documents(); +?> + --- a/documents/disclogsList.php +++ b/documents/disclogsList.php @@ -1,53 +1,76 @@ Agency NameDisclosure Log URL recorded?Do we monitor this URL?"; $agenciesdb = $server->get_db('disclosr-agencies'); $docsdb = $server->get_db('disclosr-documents'); +$agencies = 0; +$disclogs = 0; +$red = 0; +$green = 0; +$yellow = 0; +$orange = 0; try { $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows; if ($rows) { foreach ($rows as $row) { + if ((!isset($row->value->status) || $row->value->status != "suspended") && isset($row->value->foiEmail)) { + echo ""; + if (isset($row->value->website)) echo ""; + echo "" . $row->value->name . ""; + if (isset($row->value->website)) echo ""; + if ($ENV == "DEV") + echo "
(" . $row->id . ")"; + echo "\n"; + $agencies++; - echo "" . $row->value->name . ""; - if ($ENV == "DEV") - echo "
(" . $row->id . ")"; - echo "\n"; - - - echo ""; - if (isset($row->value->FOIDocumentsURL)) { - echo '' - . $row->value->FOIDocumentsURL . ''; - if ($ENV == "DEV") - echo '
(' - . 'view local copy)'; - } else { - echo "✘"; + echo ""; + if (isset($row->value->FOIDocumentsURL)) { + $disclogs++; + echo '' + . $row->value->FOIDocumentsURL . ''; + if ($ENV == "DEV") + echo '
(' + . 'view local copy)'; + } else { + echo "✘"; + } + echo "\n"; + if (isset($row->value->FOIDocumentsURL)) { + if (file_exists("./scrapers/" . $row->id . '.py')) { + echo "✔"; + $green++; + } else if (file_exists("./scrapers/" . $row->id . '.txt')) { + if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") { + echo "◎"; + $yellow++; + } else { + echo file_get_contents("./scrapers/" . $row->id . '.txt'); + echo "▬"; + $orange++; + } + } else { + echo "✘"; + $red++; + } + } + echo "\n"; } - echo "\n"; - if (isset($row->value->FOIDocumentsURL)) { - if (file_exists("./scrapers/" . $row->id . '.py')) { - echo "✔"; - } else if (file_exists("./scrapers/" . $row->id . '.txt')) { - echo "▬"; - } else { - echo "✘"; - } - } - echo "\n"; } } } catch (SetteeRestClientException $e) { setteErrorHandler($e); } echo ""; +echo $agencies . " agencies, " . round(($disclogs / $agencies) * 100) . "% with disclosure logs; " + . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . 
"% logs Work-In-Progress scrapers "; + include_footer_documents(); ?> --- /dev/null +++ b/documents/disclosr-documents.nja @@ -1,1 +1,7 @@ - +{ + "venv": "", + "project-type": "Import from sources", + "name": "disclosr-documents", + "license": "GNU General Public License v3", + "description": "" +} --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -1,143 +1,281 @@ -import sys,os +import sys +import os sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) import scrape from bs4 import BeautifulSoup from time import mktime import feedparser import abc -import unicodedata, re +import unicodedata +import re import dateutil from dateutil.parser import * from datetime import * +import codecs + +import difflib + +from StringIO import StringIO + +from pdfminer.pdfparser import PDFDocument, PDFParser +from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf +from pdfminer.pdfdevice import PDFDevice, TagExtractor +from pdfminer.converter import TextConverter +from pdfminer.cmapdb import CMapDB +from pdfminer.layout import LAParams + class GenericDisclogScraper(object): - __metaclass__ = abc.ABCMeta - agencyID = None - disclogURL = None - def remove_control_chars(self, input): - return "".join([i for i in input if ord(i) in range(32, 127)]) - def getAgencyID(self): - """ disclosr agency id """ - if self.agencyID == None: - self.agencyID = os.path.basename(sys.argv[0]).replace(".py","") - return self.agencyID - - def getURL(self): - """ disclog URL""" - if self.disclogURL == None: - agency = scrape.agencydb.get(self.getAgencyID()) - self.disclogURL = agency['FOIDocumentsURL'] - return self.disclogURL - - @abc.abstractmethod - def doScrape(self): - """ do the scraping """ - return - - @abc.abstractmethod - def getDescription(self, content, entry, doc): - """ get description""" - return - + __metaclass__ = abc.ABCMeta + agencyID = None + disclogURL = None + + def remove_control_chars(self, input): + return 
"".join([i for i in input if ord(i) in range(32, 127)]) + + def getAgencyID(self): + """ disclosr agency id """ + if self.agencyID is None: + self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "") + return self.agencyID + + def getURL(self): + """ disclog URL""" + if self.disclogURL is None: + agency = scrape.agencydb.get(self.getAgencyID()) + self.disclogURL = agency['FOIDocumentsURL'] + return self.disclogURL + + @abc.abstractmethod + def doScrape(self): + """ do the scraping """ + return + +class GenericHTMLDisclogScraper(GenericDisclogScraper): + + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + content = rcontent + dochash = scrape.mkhash(content) + doc = foidocsdb.get(dochash) + if doc is None: + print "saving " + dochash + description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. 
Please refer to the agency's website Disclosure Log to see the most recent entries" + last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL()) + if last_attach != None: + html_diff = difflib.HtmlDiff() + description = description + "\nChanges: " + description = description + html_diff.make_table(last_attach.read().split('\n'), + content.split('\n')) + edate = date.today().strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID() + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description} + foidocsdb.save(doc) + else: + print "already saved" + +class GenericPDFDisclogScraper(GenericDisclogScraper): + + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + laparams = LAParams() + rsrcmgr = PDFResourceManager(caching=True) + outfp = StringIO() + device = TextConverter(rsrcmgr, outfp, codec='utf-8', + laparams=laparams) + fp = StringIO() + fp.write(content) + + process_pdf(rsrcmgr, device, fp, set(), caching=True, + check_extractable=True) + description = outfp.getvalue() + fp.close() + device.close() + outfp.close() + dochash = scrape.mkhash(description) + doc = foidocsdb.get(dochash) + if doc is None: + print "saving " + dochash + edate = date.today().strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID() + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description} + foidocsdb.save(doc) + else: + print "already saved" + + +class GenericDOCXDisclogScraper(GenericDisclogScraper): + + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb + , self.getURL(), "foidocuments", self.getAgencyID()) + mydoc = zipfile.ZipFile(file) + xmlcontent = mydoc.read('word/document.xml') + document = 
etree.fromstring(xmlcontent) + ## Fetch all the text out of the document we just created + paratextlist = getdocumenttext(document) + # Make explicit unicode version + newparatextlist = [] + for paratext in paratextlist: + newparatextlist.append(paratext.encode("utf-8")) + ## Print our documnts test with two newlines under each paragraph + description = '\n\n'.join(newparatextlist).strip(' \t\n\r') + dochash = scrape.mkhash(description) + doc = foidocsdb.get(dochash) + + if doc is None: + print "saving " + dochash + edate = time().strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID() + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description} + foidocsdb.save(doc) + else: + print "already saved" class GenericRSSDisclogScraper(GenericDisclogScraper): - def doScrape(self): - foidocsdb = scrape.couch['disclosr-foidocuments'] - (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) - feed = feedparser.parse(content) - for entry in feed.entries: - #print entry - print entry.id - hash = scrape.mkhash(entry.id) - #print hash - doc = foidocsdb.get(hash) - #print doc - if doc == None: - print "saving "+ hash - edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d") - doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id, - "date": edate,"title": entry.title} - self.getDescription(entry,entry, doc) - foidocsdb.save(doc) + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + feed = feedparser.parse(content) + for entry in feed.entries: + #print entry + print entry.id + dochash = scrape.mkhash(entry.id) + doc = foidocsdb.get(dochash) + #print doc + if doc is None: + print "saving " + dochash + edate = datetime.fromtimestamp( + 
mktime(entry.published_parsed)).strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID(), + 'url': entry.link, 'docID': entry.id, + "date": edate, "title": entry.title} + self.getDescription(entry, entry, doc) + foidocsdb.save(doc) + else: + print "already saved" + + def getDescription(self, content, entry, doc): + """ get description from rss entry""" + doc.update({'description': content.summary}) + return + + +class GenericOAICDisclogScraper(GenericDisclogScraper): + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def getColumns(self, columns): + """ rearranges columns if required """ + return + + def getColumnCount(self): + return 5 + + def getDescription(self, content, entry, doc): + """ get description from rss entry""" + descriptiontxt = "" + for string in content.stripped_strings: + descriptiontxt = descriptiontxt + " \n" + string + doc.update({'description': descriptiontxt}) + + def getTitle(self, content, entry, doc): + doc.update({'title': (''.join(content.stripped_strings))}) + + def getTable(self, soup): + return soup.table + + def getRows(self, table): + return table.find_all('tr') + + def getDate(self, content, entry, doc): + date = ''.join(content.stripped_strings).strip() + (a, b, c) = date.partition("(") + date = self.remove_control_chars(a.replace("Octber", "October")) + print date + edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + print edate + doc.update({'date': edate}) + return + + def getLinks(self, content, entry, doc): + links = [] + for atag in entry.find_all("a"): + if atag.has_key('href'): + links.append(scrape.fullurl(content, atag['href'])) + if links != []: + doc.update({'links': links}) + return + + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + if content is not None: + if mime_type == "text/html" or mime_type == "application/xhtml+xml" or 
mime_type == "application/xml": + # http://www.crummy.com/software/BeautifulSoup/documentation.html + print "parsing" + soup = BeautifulSoup(content) + table = self.getTable(soup) + for row in self.getRows(table): + columns = row.find_all('td') + if len(columns) is self.getColumnCount(): + (id, date, title, + description, notes) = self.getColumns(columns) + print self.remove_control_chars( + ''.join(id.stripped_strings)) + if id.string is None: + dochash = scrape.mkhash( + self.remove_control_chars( + url + (''.join(date.stripped_strings)))) else: - print "already saved" - def getDescription(self, content, entry, doc): - """ get description from rss entry""" - doc.update({'description': content.summary}) - return - -class GenericOAICDisclogScraper(GenericDisclogScraper): - __metaclass__ = abc.ABCMeta - @abc.abstractmethod - def getColumns(self,columns): - """ rearranges columns if required """ - return - def getColumnCount(self): - return 5 - def getDescription(self, content, entry, doc): - """ get description from rss entry""" - descriptiontxt = "" - for string in content.stripped_strings: - descriptiontxt = descriptiontxt + " \n" + string - doc.update({'description': descriptiontxt}) - return - def getTitle(self, content, entry, doc): - doc.update({'title': content.string}) - return - def getTable(self, soup): - return soup.table - def getDate(self, content, entry, doc): - edate = parse(''.join(content.stripped_strings).strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") - print edate - doc.update({'date': edate}) - return - def getLinks(self, content, entry, doc): - links = [] - for atag in entry.find_all("a"): - if atag.has_key('href'): - links.append(scrape.fullurl(content,atag['href'])) - if links != []: - doc.update({'links': links}) - return - - def doScrape(self): - foidocsdb = scrape.couch['disclosr-foidocuments'] - (url,mime_type,content) = scrape.fetchURL(scrape.docsdb, self.getURL(), "foidocuments", self.getAgencyID()) - if content != None: - if 
mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": - # http://www.crummy.com/software/BeautifulSoup/documentation.html - soup = BeautifulSoup(content) - table = self.getTable(soup) - for row in table.find_all('tr'): - columns = row.find_all('td') - if len(columns) == self.getColumnCount(): - (id, date, description, title, notes) = self.getColumns(columns) - print ''.join(id.stripped_strings) - if id.string == None: - hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings)))) - else: - hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings)))) - doc = foidocsdb.get(hash) - - if doc == None: - print "saving " +hash - doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string} - self.getLinks(self.getURL(),row,doc) - self.getTitle(title,row, doc) - self.getDate(date,row, doc) - self.getDescription(description,row, doc) - if notes != None: - doc.update({ 'notes': notes.string}) - foidocsdb.save(doc) - else: - print "already saved "+hash - - elif len(row.find_all('th')) == self.getColumnCount(): - print "header row" - - else: - print "ERROR number of columns incorrect" - print row - + dochash = scrape.mkhash( + self.remove_control_chars( + url + (''.join(id.stripped_strings)))) + doc = foidocsdb.get(dochash) + + if doc is None: + print "saving " + dochash + doc = {'_id': dochash, + 'agencyID': self.getAgencyID(), + 'url': self.getURL(), + 'docID': (''.join(id.stripped_strings))} + self.getLinks(self.getURL(), row, doc) + self.getTitle(title, row, doc) + self.getDate(date, row, doc) + self.getDescription(description, row, doc) + if notes is not None: + doc.update({ 'notes': ( + ''.join(notes.stripped_strings))}) + badtitles = ['-','Summary of FOI Request' + , 'FOI request(in summary form)' + , 'Summary of FOI request received by the ASC', +'Summary of FOI request received by agency/minister', +'Description of Documents Requested','FOI 
request', +'Description of FOI Request','Summary of request','Description','Summary', +'Summary of FOIrequest received by agency/minister','Summary of FOI request received','Description of FOI Request',"FOI request",'Results 1 to 67 of 67'] + if doc['title'] not in badtitles\ + and doc['description'] != '': + print "saving" + foidocsdb.save(doc) + else: + print "already saved " + dochash + + elif len(row.find_all('th')) is self.getColumnCount(): + print "header row" + + else: + print "ERROR number of columns incorrect" + print row + --- a/documents/index.php +++ b/documents/index.php @@ -2,11 +2,12 @@ include('template.inc.php'); include_header_documents(""); include_once('../include/common.inc.php'); +$endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99'); +$enddocid = (isset($_REQUEST['end_docid']) ? $_REQUEST['end_docid'] : null); ?> +
Read all the information released by Australian Federal Government agencies under the FOI Act in one place!
+RSS Icon All Agencies RSS Feed
get_db('disclosr-agencies'); $idtoname = Array(); @@ -15,17 +16,20 @@ } $foidocsdb = $server->get_db('disclosr-foidocuments'); try { - $rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99','0000-00-00'), true)->rows; - - + $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20,null, $enddocid)->rows; if ($rows) { - foreach ($rows as $row) { -displayLogEntry($row,$idtoname); + foreach ($rows as $key => $row) { + echo displayLogEntry($row, $idtoname); + if (!isset($startkey)) + $startkey = $row->key; + $endkey = $row->key; + $enddocid = $row->value->_id; } } } catch (SetteeRestClientException $e) { setteErrorHandler($e); } +echo "next page "; include_footer_documents(); ?> --- a/documents/robots.txt +++ b/documents/robots.txt @@ -2,4 +2,5 @@ # http://code.google.com/web/controlcrawlindex/ User-agent: * - +Disallow: /admin/ +Sitemap: http://disclosurelo.gs/sitemap.xml.php --- a/documents/rss.xml.php +++ b/documents/rss.xml.php @@ -3,28 +3,48 @@ // Agency X updated Y, new files, diff of plain text/link text, // feed for just one agency or all // This is a minimum example of using the Universal Feed Generator Class -include("lib/FeedWriter.php"); +include("../lib/FeedWriter/FeedTypes.php"); +include_once('../include/common.inc.php'); //Creating an instance of FeedWriter class. 
-$TestFeed = new FeedWriter(RSS2); +$TestFeed = new RSS2FeedWriter(); //Setting the channel elements +////Retriving informations from database +$idtoname = Array(); +$agenciesdb = $server->get_db('disclosr-agencies'); +foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) { + $idtoname[$row->id] = trim($row->value->name); +} +$foidocsdb = $server->get_db('disclosr-foidocuments'); +if (isset($_REQUEST['id'])) { + $rows = $foidocsdb->get_view("app", "byAgencyID", $_REQUEST['id'], false, false, false)->rows; + $title = $idtoname[$_REQUEST['id']]; +} else { + $rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99', '0000-00-00', 50), true)->rows; + $title = 'All Agencies'; +} //Use wrapper functions for common channelelements -$TestFeed->setTitle('Last Modified - All'); -$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php'); -$TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer'); -//Retriving informations from database -$rows = $db->get_view("app", "byLastModified")->rows; +$TestFeed->setTitle('disclosurelo.gs Newest Entries - '.$title); +$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php'.(isset($_REQUEST['id'])? '?id='.$_REQUEST['id'] : '')); +$TestFeed->setDescription('disclosurelo.gs Newest Entries - '.$title); +$TestFeed->setChannelElement('language', 'en-us'); +$TestFeed->setChannelElement('pubDate', date(DATE_RSS, time())); + + //print_r($rows); foreach ($rows as $row) { //Create an empty FeedItem $newItem = $TestFeed->createNewItem(); //Add elements to the feed item - $newItem->setTitle($row['name']); - $newItem->setLink($row['id']); - $newItem->setDate(date("c", $row['metadata']['lastModified'])); - $newItem->setDescription($row['name']); + $newItem->setTitle($row->value->title); + $newItem->setLink("http://disclosurelo.gs/view.php?id=" . 
$row->value->_id); + $newItem->setDate(strtotime($row->value->date)); + $newItem->setDescription(displayLogEntry($row, $idtoname)); + $newItem->setAuthor($idtoname[$row->value->agencyID]); + $newItem->addElement('guid', "http://disclosurelo.gs/view.php?id=" . $row->value->_id, array('isPermaLink' => 'true')); //Now add the feed item $TestFeed->addItem($newItem); } //OK. Everything is done. Now genarate the feed. -$TestFeed->genarateFeed(); +$TestFeed->generateFeed(); ?> + --- /dev/null +++ b/documents/runScrapers.sh @@ -1,1 +1,10 @@ +for f in scrapers/*.py; + do echo "Processing $f file.."; + python $f; + if [ "$?" -ne "0" ]; then + echo "error"; + sleep 2; + fi +done + --- a/documents/scrape.py +++ b/documents/scrape.py @@ -8,186 +8,198 @@ import time import os import mimetypes -import re import urllib import urlparse def mkhash(input): - return hashlib.md5(input).hexdigest().encode("utf-8") + return hashlib.md5(input).hexdigest().encode("utf-8") def canonurl(url): - r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or '' - if the URL looks invalid. - >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws - 'http://xn--hgi.ws/' - """ - # strip spaces at the ends and ensure it's prefixed with 'scheme://' - url = url.strip() - if not url: - return '' - if not urlparse.urlsplit(url).scheme: - url = 'http://' + url - - # turn it into Unicode - #try: - # url = unicode(url, 'utf-8') - #except UnicodeDecodeError: - # return '' # bad UTF-8 chars in URL - - # parse the URL into its components - parsed = urlparse.urlsplit(url) - scheme, netloc, path, query, fragment = parsed - - # ensure scheme is a letter followed by letters, digits, and '+-.' 
chars - if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I): - return '' - scheme = str(scheme) - - # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port] - match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I) - if not match: - return '' - domain, port = match.groups() - netloc = domain + (port if port else '') - netloc = netloc.encode('idna') - - # ensure path is valid and convert Unicode chars to %-encoded - if not path: - path = '/' # eg: 'http://google.com' -> 'http://google.com/' - path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;') - - # ensure query is valid - query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/') - - # ensure fragment is valid - fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8'))) - - # piece it all back together, truncating it to a maximum of 4KB - url = urlparse.urlunsplit((scheme, netloc, path, query, fragment)) - return url[:4096] + r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or '' + if the URL looks invalid. + >>> canonurl('\xe2\x9e\xa1.ws') # tinyarro.ws + 'http://xn--hgi.ws/' + """ + # strip spaces at the ends and ensure it's prefixed with 'scheme://' + url = url.strip() + if not url: + return '' + if not urlparse.urlsplit(url).scheme: + url = 'http://' + url + + # turn it into Unicode + #try: + # url = unicode(url, 'utf-8') + #except UnicodeDecodeError: + # return '' # bad UTF-8 chars in URL + + # parse the URL into its components + parsed = urlparse.urlsplit(url) + scheme, netloc, path, query, fragment = parsed + + # ensure scheme is a letter followed by letters, digits, and '+-.' 
chars + if not re.match(r'[a-z][-+.a-z0-9]*$', scheme, flags=re.I): + return '' + scheme = str(scheme) + + # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port] + match = re.match(r'(.+\.[a-z0-9]{1,6})(:\d{1,5})?$', netloc, flags=re.I) + if not match: + return '' + domain, port = match.groups() + netloc = domain + (port if port else '') + netloc = netloc.encode('idna') + + # ensure path is valid and convert Unicode chars to %-encoded + if not path: + path = '/' # eg: 'http://google.com' -> 'http://google.com/' + path = urllib.quote(urllib.unquote(path.encode('utf-8')), safe='/;') + + # ensure query is valid + query = urllib.quote(urllib.unquote(query.encode('utf-8')), safe='=&?/') + + # ensure fragment is valid + fragment = urllib.quote(urllib.unquote(fragment.encode('utf-8'))) + + # piece it all back together, truncating it to a maximum of 4KB + url = urlparse.urlunsplit((scheme, netloc, path, query, fragment)) + return url[:4096] def fullurl(url,href): - href = href.replace(" ","%20") - href = re.sub('#.*$','',href) - return urljoin(url,href) + href = href.replace(" ","%20") + href = re.sub('#.*$','',href) + return urljoin(url,href) #http://diveintopython.org/http_web_services/etags.html -class NotModifiedHandler(urllib2.BaseHandler): - def http_error_304(self, req, fp, code, message, headers): - addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url()) - addinfourl.code = code - return addinfourl +class NotModifiedHandler(urllib2.BaseHandler): + def http_error_304(self, req, fp, code, message, headers): + addinfourl = urllib2.addinfourl(fp, headers, req.get_full_url()) + addinfourl.code = code + return addinfourl + +def getLastAttachment(docsdb,url): + hash = mkhash(url) + doc = docsdb.get(hash) + if doc != None: + last_attachment_fname = doc["_attachments"].keys()[-1] + last_attachment = docsdb.get_attachment(doc,last_attachment_fname) + return last_attachment + else: + return None def fetchURL(docsdb, url, fieldName, agencyID, 
scrape_again=True): - url = canonurl(url) - hash = mkhash(url) - req = urllib2.Request(url) - print "Fetching %s (%s)" % (url,hash) - if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": - print "Not a valid HTTP url" - return (None,None,None) - doc = docsdb.get(hash) - if doc == None: - doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName} - else: - if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000): - print "Uh oh, trying to scrape URL again too soon!" - last_attachment_fname = doc["_attachments"].keys()[-1] - last_attachment = docsdb.get_attachment(doc,last_attachment_fname) - content = last_attachment - return (doc['url'],doc['mime_type'],content) - if scrape_again == False: - print "Not scraping this URL again as requested" - return (None,None,None) - - time.sleep(3) # wait 3 seconds to give webserver time to recover - - req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") - #if there is a previous version stored in couchdb, load caching helper tags - if doc.has_key('etag'): - req.add_header("If-None-Match", doc['etag']) - if doc.has_key('last_modified'): - req.add_header("If-Modified-Since", doc['last_modified']) - - opener = urllib2.build_opener(NotModifiedHandler()) - try: - url_handle = opener.open(req) - doc['url'] = url_handle.geturl() # may have followed a redirect to a new url - headers = url_handle.info() # the addinfourls have the .info() too - doc['etag'] = headers.getheader("ETag") - doc['last_modified'] = headers.getheader("Last-Modified") - doc['date'] = headers.getheader("Date") - doc['page_scraped'] = time.time() - doc['web_server'] = headers.getheader("Server") - doc['via'] = headers.getheader("Via") - doc['powered_by'] = headers.getheader("X-Powered-By") - doc['file_size'] = headers.getheader("Content-Length") - content_type = headers.getheader("Content-Type") - if 
content_type != None: - doc['mime_type'] = content_type.split(";")[0] - else: - (type,encoding) = mimetypes.guess_type(url) - doc['mime_type'] = type - if hasattr(url_handle, 'code'): - if url_handle.code == 304: - print "the web page has not been modified" - return (None,None,None) - else: - content = url_handle.read() - docsdb.save(doc) - doc = docsdb.get(hash) # need to get a _rev - docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type']) - return (doc['url'], doc['mime_type'], content) - #store as attachment epoch-filename - - except urllib2.URLError as e: - error = "" - if hasattr(e, 'reason'): - error = "error %s in downloading %s" % (str(e.reason), url) - elif hasattr(e, 'code'): - error = "error %s in downloading %s" % (e.code, url) - print error - doc['error'] = error - docsdb.save(doc) - return (None,None,None) + url = canonurl(url) + hash = mkhash(url) + req = urllib2.Request(url) + print "Fetching %s (%s)" % (url,hash) + if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": + print "Not a valid HTTP url" + return (None,None,None) + doc = docsdb.get(hash) + if doc == None: + doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName} + else: + if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000): + print "Uh oh, trying to scrape URL again too soon!"+hash + last_attachment_fname = doc["_attachments"].keys()[-1] + last_attachment = docsdb.get_attachment(doc,last_attachment_fname) + content = last_attachment + return (doc['url'],doc['mime_type'],content.read()) + if scrape_again == False: + print "Not scraping this URL again as requested" + return (doc['url'],doc['mime_type'],content.read()) + + req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") + #if there is a previous version stored in couchdb, load caching helper tags + if doc.has_key('etag'): + 
req.add_header("If-None-Match", doc['etag']) + if doc.has_key('last_modified'): + req.add_header("If-Modified-Since", doc['last_modified']) + + opener = urllib2.build_opener(NotModifiedHandler()) + try: + url_handle = opener.open(req) + doc['url'] = url_handle.geturl() # may have followed a redirect to a new url + headers = url_handle.info() # the addinfourls have the .info() too + doc['etag'] = headers.getheader("ETag") + doc['last_modified'] = headers.getheader("Last-Modified") + doc['date'] = headers.getheader("Date") + doc['page_scraped'] = time.time() + doc['web_server'] = headers.getheader("Server") + doc['via'] = headers.getheader("Via") + doc['powered_by'] = headers.getheader("X-Powered-By") + doc['file_size'] = headers.getheader("Content-Length") + content_type = headers.getheader("Content-Type") + if content_type != None: + doc['mime_type'] = content_type.split(";")[0] + else: + (type,encoding) = mimetypes.guess_type(url) + doc['mime_type'] = type + if hasattr(url_handle, 'code'): + if url_handle.code == 304: + print "the web page has not been modified"+hash + last_attachment_fname = doc["_attachments"].keys()[-1] + last_attachment = docsdb.get_attachment(doc,last_attachment_fname) + content = last_attachment + return (doc['url'],doc['mime_type'],content.read()) + else: + print "new webpage loaded" + content = url_handle.read() + docsdb.save(doc) + doc = docsdb.get(hash) # need to get a _rev + docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type']) + return (doc['url'], doc['mime_type'], content) + #store as attachment epoch-filename + + except urllib2.URLError as e: + print "error!" 
+ error = "" + if hasattr(e, 'reason'): + error = "error %s in downloading %s" % (str(e.reason), url) + elif hasattr(e, 'code'): + error = "error %s in downloading %s" % (e.code, url) + print error + doc['error'] = error + docsdb.save(doc) + return (None,None,None) def scrapeAndStore(docsdb, url, depth, fieldName, agencyID): - (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID) - badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"] - if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report": - if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": - # http://www.crummy.com/software/BeautifulSoup/documentation.html - soup = BeautifulSoup(content) - navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) - for nav in navIDs: - print "Removing element", nav['id'] - nav.extract() - navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) - for nav in navClasses: - print "Removing element", nav['class'] - nav.extract() - links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) - linkurls = set([]) - for link in links: - if link.has_key("href"): - if link['href'].startswith("http"): - # lets not do external links for now - # linkurls.add(link['href']) - None - if link['href'].startswith("mailto"): - # not http - None - if link['href'].startswith("javascript"): - # not http - None - else: - # remove anchors and spaces in urls - linkurls.add(fullurl(url,link['href'])) - for linkurl in linkurls: - #print linkurl - scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID) + (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID) + badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"] + 
if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report": + if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": + # http://www.crummy.com/software/BeautifulSoup/documentation.html + soup = BeautifulSoup(content) + navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) + for nav in navIDs: + print "Removing element", nav['id'] + nav.extract() + navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) + for nav in navClasses: + print "Removing element", nav['class'] + nav.extract() + links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) + linkurls = set([]) + for link in links: + if link.has_key("href"): + if link['href'].startswith("http"): + # lets not do external links for now + # linkurls.add(link['href']) + None + if link['href'].startswith("mailto"): + # not http + None + if link['href'].startswith("javascript"): + # not http + None + else: + # remove anchors and spaces in urls + linkurls.add(fullurl(url,link['href'])) + for linkurl in linkurls: + #print linkurl + scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID) #couch = couchdb.Server('http://192.168.1.148:5984/') couch = couchdb.Server('http://127.0.0.1:5984/') @@ -196,20 +208,20 @@ docsdb = couch['disclosr-documents'] if __name__ == "__main__": - for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view? 
- agency = agencydb.get(row.id) - print agency['name'] - for key in agency.keys(): - if key == "FOIDocumentsURL" and "status" not in agency.keys: - scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) - if key == 'website' and False: - scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) - if key.endswith('URL') and False: - print key - depth = 1 - if 'scrapeDepth' in agency.keys(): - depth = agency['scrapeDepth'] - scrapeAndStore(docsdb, agency[key],depth,key,agency['_id']) - agency['metadata']['lastScraped'] = time.time() - agencydb.save(agency) - + for row in agencydb.view('app/getScrapeRequired'): #not recently scraped agencies view? + agency = agencydb.get(row.id) + print agency['name'] + for key in agency.keys(): + if key == "FOIDocumentsURL" and "status" not in agency.keys: + scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) + if key == 'website' and False: + scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) + agency['metadata']['lastScraped'] = time.time() + if key.endswith('URL') and False: + print key + depth = 1 + if 'scrapeDepth' in agency.keys(): + depth = agency['scrapeDepth'] + scrapeAndStore(docsdb, agency[key],depth,key,agency['_id']) + agencydb.save(agency) + --- /dev/null +++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py @@ -1,1 +1,48 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import traceback +try: + import amonpy + amonpy.config.address = 'http://amon_instance:port' + amonpy.config.secret_key = 'the secret key from /etc/amon.conf' + amon_available = True +except ImportError: + amon_available = False +class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericPDFDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + 
genericScrapers.GenericPDFDisclogScraper) + try: + ScraperImplementation().doScrape() + except Exception, err: + sys.stderr.write('ERROR: %s\n' % str(err)) + print ‘Error Reason: ‘, err.__doc__ + print ‘Exception: ‘, err.__class__ + print traceback.format_exc() + if amon_available: + data = { + 'exception_class': '', + 'url': '', + 'backtrace': ['exception line ', 'another exception line'], + 'enviroment': '', + + # In 'data' you can add request information, session variables - it's a recursive + # dictionary, so you can literally add everything important for your specific case + # The dictionary doesn't have a specified structure, the keys below are only example + 'data': {'request': '', 'session': '', 'more': ''} + + } + + amonpy.exception(data) + pass + --- /dev/null +++ b/documents/scrapers/00a294de663db69062ca09aede7c0487.py @@ -1,1 +1,47 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import dateutil +from dateutil.parser import * +from datetime import * + +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + def getDate(self, content, entry, doc): + date = ''.join(entry.find('th').stripped_strings).strip() + (a, b, c) = date.partition("(") + date = self.remove_control_chars(a.replace("Octber", "October")) + print date + edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + print edate + doc.update({'date': edate}) + return + def getColumnCount(self): + return 4 + + def getTable(self, soup): + return soup.find(summary="List of Defence documents released under Freedom of Information requets") + + def getColumns(self, columns): + (id, description, access, notes) = columns + return (id, None, description, description, notes) + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', 
isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + + nsi = ScraperImplementation() + nsi.disclogURL = "http://www.defence.gov.au/foi/disclosure_log_201213.cfm" + nsi.doScrape() + + nsi.disclogURL = "http://www.defence.gov.au/foi/disclosure_log_201112.cfm" + nsi.doScrape() + + nsi.disclogURL = "http://www.defence.gov.au/foi/disclosure_log_201011.cfm" + nsi.doScrape() + + --- /dev/null +++ b/documents/scrapers/0324e4b1654fd6dd651307abcef67094.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 6 + def getColumns(self,columns): + (id, date, title, description, notes,link) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/0372b19123076338d483f624c433727b.txt @@ -1,1 +1,2 @@ +docx --- /dev/null +++ b/documents/scrapers/0ae822d1a748e60d90f0b79b97d5a3e5.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ 
b/documents/scrapers/0ced9dd2de36100c3cabdb7fd8e843a9.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/1097fa8afdcf5db89d212d0979226667.py +++ b/documents/scrapers/1097fa8afdcf5db89d212d0979226667.py @@ -8,7 +8,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) --- a/documents/scrapers/157cbe654bdaa0a48e6650152ae34489.py +++ b/documents/scrapers/157cbe654bdaa0a48e6650152ae34489.py @@ -10,7 +10,7 @@ return 5 def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) def getTable(self,soup): return soup.find_all('table')[4] --- /dev/null +++ b/documents/scrapers/1803322b27286950cab0c543168b5f21.py @@ -1,1 +1,58 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers 
+import dateutil +from dateutil.parser import * +from datetime import * +import scrape +from bs4 import BeautifulSoup +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def __init__(self): + super(ScraperImplementation, self).__init__() + + def getDescription(self,content, entry,doc): + link = None + links = [] + description = "" + for atag in entry.find_all('a'): + if atag.has_key('href'): + link = scrape.fullurl(self.getURL(), atag['href']) + (url, mime_type, htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) + if htcontent != None: + if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": + soup = BeautifulSoup(htcontent) + row = soup.find(id="content_div_148050") + description = ''.join(row.stripped_strings) + for atag in row.find_all("a"): + if atag.has_key('href'): + links.append(scrape.fullurl(link, atag['href'])) + + if links != []: + doc.update({'links': links}) + if description != "": + doc.update({ 'description': description}) + def getColumnCount(self): + return 4 + + def getColumns(self, columns): + (id, date, datepub, title) = columns + return (id, date, title, title, None) + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + + nsi = ScraperImplementation() + nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=1" + nsi.doScrape() + nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=2" + nsi.doScrape() + nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=3" + nsi.doScrape() + nsi.disclogURL = 
"http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=4" + nsi.doScrape() + nsi.disclogURL = "http://www.dbcde.gov.au/about_us/freedom_of_information_disclosure_log/foi_list?result_146858_result_page=5" + nsi.doScrape() + --- /dev/null +++ b/documents/scrapers/1d404c4934f74feacd00dcb434e7c10a.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "cphMain_C001_Col01").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/227cb6eb7d2c9f8a6e846df7447d6caa.py +++ b/documents/scrapers/227cb6eb7d2c9f8a6e846df7447d6caa.py @@ -21,9 +21,10 @@ for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'): if row != None: rowtitle = row.find('th').string - description = description + "\n" + rowtitle + ": " + if rowtitle != None: + description = description + "\n" + rowtitle + ": " for text in row.find('td').stripped_strings: - description = description + text + description = description + text for atag in row.find_all("a"): if atag.has_key('href'): links.append(scrape.fullurl(link,atag['href'])) --- /dev/null +++ b/documents/scrapers/24bd71114d3975ed9a63ad29624c62c9.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from 
bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "inner_content") + def getColumnCount(self): + return 2 + def getColumns(self,columns): + (date, title) = columns + return (date, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/2cac2cd1f42687db2d04fa20b5b6a538.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (id, title, date) = columns + return (id, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/31685505438d393f45a90f442b8fa27f.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + 
genericScrapers.GenericPDFDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericPDFDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py @@ -1,1 +1,32 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + def getTitle(self, content, entry, doc): + i = 0 + title = "" + for string in content.stripped_strings: + if i < 2: + title = title + string + i = i+1 + doc.update({'title': title}) + print title + return + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/3b54190e3f409380e109fae29e1917aa.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description, link, deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), 
genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/3cd40b1240e987cbcd3f0e67054ce259.py +++ b/documents/scrapers/3cd40b1240e987cbcd3f0e67054ce259.py @@ -7,7 +7,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, description, title, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) --- /dev/null +++ b/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py @@ -1,1 +1,47 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(class_ = "inner-column").table + def getRows(self,table): + return table.tbody.find_all('tr',recursive=False) + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + def getDate(self, content, entry, doc): + i = 0 + date = "" + for string in content.stripped_strings: + if i ==1: + date = string + i = i+1 + edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + print edate + doc.update({'date': edate}) + return + def getTitle(self, content, entry, doc): + i = 0 + title = "" + for string in content.stripped_strings: + if i < 2: + title = title + string + i = i+1 + doc.update({'title': title}) + #print title + return + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), 
genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/3e2f110af49d62833a835bd257771ffb.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/41a166419503bb50e410c58be54c102f.py @@ -1,1 +1,27 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +from datetime import date +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id= "ctl00_MSO_ContentDiv").table + + def getColumns(self,columns): + (id, title, description, notes) = columns + return (id, title, title, description, notes) + def getDate(self, content, entry, doc): + edate = date.today().strftime("%Y-%m-%d") + doc.update({'date': edate}) + return + def getColumnCount(self): + return 4 + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/4934000fddd6a5b1094f398798341290.py @@ -1,1 +1,23 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup 
+import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + si = ScraperImplementation() + si.doScrape() + --- /dev/null +++ b/documents/scrapers/4c57389dda9bd454bcb08bc1e5ed87bf.txt @@ -1,1 +1,2 @@ +parent --- /dev/null +++ b/documents/scrapers/4d2af2dcc72f1703bbf04b13b03720a8.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/50601505ef69483121a6d130bb0515e4.txt +++ b/documents/scrapers/50601505ef69483121a6d130bb0515e4.txt @@ -1,1 +1,1 @@ -apsc has ACMA style disclog +ACMA style --- /dev/null +++ b/documents/scrapers/525c3953187da08cd702359b2fc2997f.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + 
genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/53b14397c8f27c29ff07b6319f7a0ec5.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/53d2884f8afd026096a27bd5051ec50e.py +++ b/documents/scrapers/53d2884f8afd026096a27bd5051ec50e.py @@ -10,7 +10,7 @@ return soup.find(class_ = "ms-rtestate-field").table def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) def getLinks(self, content, entry, doc): link = None --- /dev/null +++ b/documents/scrapers/54cbb3439276062b7a9f007f9f69d1f6.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = 
"ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/55b69726fde4b4898ecf6d7217d1d1d2.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/5716ce0aacfe98f7d638b7a66b7f1040.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (date, id, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', 
issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/5d05365e981d87e746b596d63e35b1dc.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/601aedeef4344638d635bdd761e9fdba.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (date, title, description,notes) = columns + return (title, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), 
genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/649b053f5e2884906ddc7174c2cd4b38.py @@ -1,1 +1,28 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + si = ScraperImplementation() + si.doScrape() + si.disclogURL = "http://www.fahcsia.gov.au/disclosure-log-2011-12-financial-year" + si.doScrape() + si.disclogURL = "http://www.fahcsia.gov.au/disclosure-log-2010-11-financial-year" + si.doScrape() + + --- /dev/null +++ b/documents/scrapers/655d4d67333536bda18d68265dfe7e80.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import 
genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id="node-30609") + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/656f7bb1884f4b9d31ebe2a5f5f58064.txt @@ -1,1 +1,2 @@ +list style --- /dev/null +++ b/documents/scrapers/65ec17101b00519e6d88c5a9f33c2c46.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (id, date, description) = columns + return (id, date, description, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/69d59284ef0ccd2677394d82d3292abc.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class 
ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "centercontent").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/6ac74a939f420c6194ae29224809734a.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/6afdde1d4ff1ad8d8cfe1a8675ea83bd.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericOAICDisclogScraper) + print 
'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/6cf3870aedeeecfd6394b5c0abed4c55.py @@ -1,1 +1,23 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + si = ScraperImplementation() + si.doScrape() + --- /dev/null +++ b/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "content_div_50269").table + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/6fa04af95fbe7de96daa2c7560e0aad3.txt +++ /dev/null @@ -1,19 +1,1 @@ -import sys,os -sys.path.insert(0, 
os.path.join(os.path.dirname(__file__) or '.', '../')) -import genericScrapers -import scrape -from bs4 import BeautifulSoup -#http://www.doughellmann.com/PyMOTW/abc/ -class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getTable(self,soup): - return soup.find(id = "content_div_50269").table - def getColumns(self,columns): - (id, date, title, description, notes) = columns - return (id, date, title, description, notes) - -if __name__ == '__main__': - print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) - print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) - ScraperImplementation().doScrape() - --- a/documents/scrapers/6fe3c812a99d486963133459b2768cf6.py +++ b/documents/scrapers/6fe3c812a99d486963133459b2768cf6.py @@ -8,7 +8,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) --- /dev/null +++ b/documents/scrapers/72a295f10734d64e8185f651fd2b39ea.txt @@ -1,1 +1,2 @@ +weird div based log with tables of links --- /dev/null +++ b/documents/scrapers/75d8f1c605ef9da0c2590264b7aa046b.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "content-middle").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 
'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/768bbbfb34115873af361af8519b38a9.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/77f02f713e3c37bff73882fb90828379.py @@ -1,1 +1,22 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find("table",width="571") +#findAll("table")[3] + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), 
genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/794ea270edc9aa4f70f2a84bbc5ecc7a.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "cphMain_C001_Col01").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/795c33ed030293dbdb155c909ea47e37.py +++ b/documents/scrapers/795c33ed030293dbdb155c909ea47e37.py @@ -10,7 +10,7 @@ return 7 def getColumns(self,columns): (id, date, title, description, notes, deletedate, otherinfo) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) #def getTable(self,soup): # return soup.find(class_ = "box").table --- /dev/null +++ b/documents/scrapers/795e7a8afb39a420360aa207b0cb1306.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + 
ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/7b39ce7f362a0af9a711eaf223943eea.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.py @@ -1,1 +1,51 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from datetime import date +from pyquery import PyQuery as pq +from lxml import etree +import urllib +import dateutil +from dateutil.parser import * +class ACMADisclogScraper(genericScrapers.GenericDisclogScraper): + + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + + d = pq(content) + d.make_links_absolute(base_url = self.getURL()) + for table in d('table').items(): + title= table('thead').text() + print title + (idate,descA,descB,link,deldate,notes) = table('tbody tr').map(lambda i, e: pq(e).children().eq(1).text()) + links = table('a').map(lambda i, e: pq(e).attr('href')) + description = descA+" "+descB + edate = parse(idate[:12], dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") + print edate + dochash = scrape.mkhash(self.remove_control_chars(title)) + doc = foidocsdb.get(dochash) + if doc is None: + print "saving " + dochash + edate = date.today().strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID() + , 'url': 
self.getURL(), 'docID': dochash, + "links": links, + "date": edate, "notes": notes, "title": title, "description": description} + #print doc + foidocsdb.save(doc) + else: + print "already saved" + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ACMADisclogScraper, + genericScrapers.GenericDisclogScraper) + print 'Instance:', isinstance(ACMADisclogScraper(), + genericScrapers.GenericDisclogScraper) + ACMADisclogScraper().doScrape() + --- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt +++ /dev/null --- /dev/null +++ b/documents/scrapers/7ec28d7d97fcf493b1350acd03e3642e.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/7f55a3c42ad7460254906aa043a6e324.py @@ -1,1 +1,24 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getTitle(self, content, entry, doc): + 
doc.update({'title': content.stripped_strings.next()}) + return + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, id, description) = columns + return (id, date, description, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/8317df630946937864d31a4728ad8ee8.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/8796220032faf94501bd366763263685.py @@ -1,1 +1,37 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import dateutil +from dateutil.parser import * +from datetime import * + +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + def getColumnCount(self): + return 6 + + def getColumns(self, columns): + (id, date, title, description, datepub, notes) = columns + return (id, date, title, description, notes) + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + + nsi = ScraperImplementation() + 
nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/pmo/2011-12.cfm" + nsi.doScrape() + nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/dpmc/2011-12.cfm" + nsi.doScrape() + nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/dpmc/2012-13.cfm" + nsi.doScrape() + nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/omsi/2011-12.cfm" + nsi.doScrape() + nsi.disclogURL = "http://www.dpmc.gov.au/foi/ips/disclosure_logs/omps/2012-13.cfm" + nsi.doScrape() + --- /dev/null +++ b/documents/scrapers/8aae1c28db7f3ce10f232a0137be6bb2.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/8c9421f852c441910bf1d93a57b31d64.py +++ b/documents/scrapers/8c9421f852c441910bf1d93a57b31d64.py @@ -7,7 +7,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) --- /dev/null +++ 
b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py @@ -1,1 +1,85 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import codecs +#http://www.doughellmann.com/PyMOTW/abc/ +class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getDescription(self,content, entry,doc): + link = None + links = [] + description = "" + for atag in entry.find_all('a'): + if atag.has_key('href'): + link = scrape.fullurl(self.getURL(),atag['href']) + (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) + if htcontent != None: + if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": + # http://www.crummy.com/software/BeautifulSoup/documentation.html + soup = BeautifulSoup(htcontent) + for text in soup.find(id="divFullWidthColumn").stripped_strings: + description = description + text.encode('ascii', 'ignore') + for atag in soup.find(id="divFullWidthColumn").find_all("a"): + if atag.has_key('href'): + links.append(scrape.fullurl(link,atag['href'])) + + if links != []: + doc.update({'links': links}) + if description != "": + doc.update({ 'description': description}) + + def getColumnCount(self): + return 2 + def getTable(self,soup): + return soup.find(id = "TwoColumnSorting") + def getColumns(self,columns): + ( title, date) = columns + return (title, date, title, title, None) +class OldScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getDescription(self,content, entry,doc): + link = None + links = [] + description = "" + for atag in entry.find_all('a'): + if atag.has_key('href'): + link = scrape.fullurl(self.getURL(),atag['href']) + (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) + if htcontent != None: + if mime_type == "text/html" or mime_type == 
"application/xhtml+xml" or mime_type =="application/xml": + # http://www.crummy.com/software/BeautifulSoup/documentation.html + soup = BeautifulSoup(htcontent) + for text in soup.find(id="content-item").stripped_strings: + description = description + text + " \n" + for atag in soup.find(id="content-item").find_all("a"): + if atag.has_key('href'): + links.append(scrape.fullurl(link,atag['href'])) + if links != []: + doc.update({'links': links}) + if description != "": + doc.update({ 'description': description}) + + if links != []: + doc.update({'links': links}) + if description != "": + doc.update({ 'description': description}) + + def getColumnCount(self): + return 2 + def getTable(self,soup): + return soup.find(class_ = "doc-list") + def getColumns(self,columns): + (date, title) = columns + return (title, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + NewScraperImplementation().doScrape() + print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + osi = OldScraperImplementation() + osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI" + osi.doScrape() + --- a/documents/scrapers/8ef0e5802f99800f514b3a148e013b75.py +++ b/documents/scrapers/8ef0e5802f99800f514b3a148e013b75.py @@ -12,7 +12,7 @@ return soup.find(class_ = "content").table def getColumns(self,columns): (id, date, title, description) = columns - return (id, date, description, title, None) + return (id, date, title, description, None) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, 
genericScrapers.GenericOAICDisclogScraper) --- /dev/null +++ b/documents/scrapers/905a1c409b6afb1de0074b13a5559560.py @@ -1,1 +1,23 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import dateutil +from dateutil.parser import * +from datetime import * +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + si = ScraperImplementation() + si.doScrape() + --- /dev/null +++ b/documents/scrapers/9282306e244040c9e4ae5705f06f9548.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/93ce83e46f5c2c4ca1b7f199b59b4bd2.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, 
os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date,logdate, description) = columns + return (id, date, description, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/99328d76c8efb56ff3f1da79b9d1b17f.txt @@ -1,1 +1,2 @@ +acma style --- /dev/null +++ b/documents/scrapers/9961dc45e046288ad1431941653af20c.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/9f4815bfdcb918a036e4bb43a30f8d77.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, 
os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/9f5cd66dea3e2ec958c17e28b27e60a7.txt @@ -1,1 +1,2 @@ +acma style --- /dev/null +++ b/documents/scrapers/a1ab9c80ab473958676c62c1a25dd502.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/a43467fe82b840a353b380c4d7462a4c.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = 
"ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (date, title, description) = columns + return (date, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/a687a9eaab9e10e9e118d3fd7cf0e13a.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id="ctl00_ContentPlaceHolderMainNoAjax_EdtrTD1494_2").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (blank,id, title,date) = columns + return (id, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/ad033512610d8e36886ab6a795f26561.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "_ctl0__ctl0_MainContentPlaceHolder_MainContentPlaceHolder_ContentSpan").findAll("table")[3] + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, 
description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/b0a3281ba66efe173c5a33d5ef90ff76.py @@ -1,1 +1,35 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import dateutil +from dateutil.parser import * +from datetime import * + +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + def getColumnCount(self): + return 2 + + def getColumns(self, columns): + (date, title) = columns + return (title, date, title, title, None) + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + + nsi = ScraperImplementation() + nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2012.htm" + nsi.doScrape() + nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2011.htm" + nsi.doScrape() + nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2010.htm" + nsi.doScrape() + nsi.disclogURL = "http://www.immi.gov.au/about/foi/foi-disclosures-2009.htm" + nsi.doScrape() + --- /dev/null +++ b/documents/scrapers/b0fb402314e685238537105ee0e70c84.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = 
"ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/b506b87c8ee9e3a7ea8007914078c741.py @@ -1,1 +1,19 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getColumnCount(self): + return 6 + def getColumns(self,columns): + (id, date, title, description,link,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/b7770c4584332cff42bb6abb3326e564.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "ctl00_PlaceHolderMain_Content__ControlWrapper_RichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + 
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/b91f866928eb61959dbbab56313214fc.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt +++ /dev/null @@ -1,2 +1,1 @@ -# does not have any disclog entries or table --- /dev/null +++ 
b/documents/scrapers/bc91b878e2317fa231cc2c512e2027f0.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/bf16d4ba0d306ee03e5a1d32aaba3da1.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(summary="This table shows every FOI request to date.") + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) 
+import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt +++ /dev/null @@ -1,1 +1,1 @@ -no disclog yet + --- a/documents/scrapers/c1302c8d7cbbd911f0d4d8a4128f8079.txt +++ b/documents/scrapers/c1302c8d7cbbd911f0d4d8a4128f8079.txt @@ -1,1 +1,1 @@ -uses RET disclog +parent --- /dev/null +++ b/documents/scrapers/c25f628f9f38d889485d7a4bff873b23.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(class_ = "ms-rtestate-field").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, date, title, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/c25f628f9f38d889485d7a4bff873b23.txt +++ /dev/null @@ -1,20 +1,1 @@ -import sys,os -sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) -import genericScrapers -import scrape -from bs4 import BeautifulSoup -#http://www.doughellmann.com/PyMOTW/abc/ -class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - - def getColumnCount(self): - 
return 4 - def getColumns(self,columns): - (id, date, title, description) = columns - return (id, date, title, description, None) - -if __name__ == '__main__': - print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) - print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) - ScraperImplementation().doScrape() - --- a/documents/scrapers/c43ca6780764f4e61918e8836be74420.py +++ b/documents/scrapers/c43ca6780764f4e61918e8836be74420.py @@ -7,7 +7,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, title,description,notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) --- /dev/null +++ b/documents/scrapers/c57c0bf315ce5977e730905707a2f6a3.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/c57c0bf315ce5977e730905707a2f6a3.txt +++ /dev/null @@ -1,3 +1,1 @@ -# pdf -http://www.awm.gov.au/about/AWM_Disclosure_Log.pdf --- /dev/null +++ b/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + 
super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/cca17a34bd490474a316fe0a1ca03c25.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "ctl00_PlaceHolderMain_ctl01__ControlWrapper_RichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/cde8eb4a2e40abb18d8b28d3b85bc9b0.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(summary="This table lists the schedule of upcoming courses.") + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, 
genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/ce34d1e9b55911e4272d2d388821f311.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/d1296c366287f7a9faedf235c7e6df01.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id="main").table + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null 
+++ b/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.py @@ -1,1 +1,49 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from datetime import date +from pyquery import PyQuery as pq +from lxml import etree +import urllib +import dateutil +from dateutil.parser import * +class ACMADisclogScraper(genericScrapers.GenericDisclogScraper): + + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + + d = pq(content) + d.make_links_absolute(base_url = self.getURL()) + for item in d('.item-list').items(): + title= item('h3').text() + print title + links = item('a').map(lambda i, e: pq(e).attr('href')) + description = title= item('ul').text() + edate = date.today().strftime("%Y-%m-%d") + print edate + dochash = scrape.mkhash(self.remove_control_chars(title)) + doc = foidocsdb.get(dochash) + if doc is None: + print "saving " + dochash + doc = {'_id': dochash, 'agencyID': self.getAgencyID() + , 'url': self.getURL(), 'docID': dochash, + "links": links, + "date": edate, "title": title, "description": description} + #print doc + foidocsdb.save(doc) + else: + print "already saved" + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ACMADisclogScraper, + genericScrapers.GenericDisclogScraper) + print 'Instance:', isinstance(ACMADisclogScraper(), + genericScrapers.GenericDisclogScraper) + ACMADisclogScraper().doScrape() + --- a/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt +++ /dev/null --- /dev/null +++ b/documents/scrapers/dae7e934f1c341ccc9547a89a8af917e.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class 
ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/dfd7414bb0c21a0076ab559901ae0588.py +++ b/documents/scrapers/dfd7414bb0c21a0076ab559901ae0588.py @@ -8,7 +8,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) def getTable(self,soup): return soup.find(class_ = "content") --- /dev/null +++ b/documents/scrapers/e0614dc3a9e25d375370ffd82f7165ac.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 6 + def getColumns(self,columns): + (id, date, title, description,deldate, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + 
ScraperImplementation().doScrape() + --- a/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py +++ b/documents/scrapers/e2a845e55bc9986e6c75c5ad2c508b8d.py @@ -7,7 +7,7 @@ class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) --- /dev/null +++ b/documents/scrapers/e64c71f4986f78675a252104c5a5f359.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 5 + def getColumns(self,columns): + (id, date, title, description,notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/e770921522a49dc77de208cc724ce134.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', 
isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/e90b1b7cbb83e3eed0b5f849c7e3af79.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "inner_content") + def getColumnCount(self): + return 2 + def getColumns(self,columns): + (date, title) = columns + return (date, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/ee30aad97f0bb32e74c4587404b67ce4.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + #def getTable(self,soup): + # return soup.find(id = "ctl00_PlaceHolderMain_intro2__ControlWrapper_CerRichHtmlField").table + def getColumnCount(self): + return 4 + def getColumns(self,columns): + (id, title, date, description) = columns + return (id, date, title, description, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- /dev/null +++ b/documents/scrapers/f0caafbcf292c90e7b8ad18ddcf9afc3.py @@ -1,1 +1,21 @@ +import sys,os 
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "genericContent").table.tbody + def getColumnCount(self): + return 3 + def getColumns(self,columns): + (id, date,title, description, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/f0caafbcf292c90e7b8ad18ddcf9afc3.txt +++ /dev/null @@ -1,21 +1,1 @@ -import sys,os -sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) -import genericScrapers -import scrape -from bs4 import BeautifulSoup -#http://www.doughellmann.com/PyMOTW/abc/ -class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getTable(self,soup): - return soup.find(id = "genericContent").table.tbody - def getColumnCount(self): - return 5 - def getColumns(self,columns): - (id, date,title, description, notes) = columns - return (id, date, title, description, notes) - -if __name__ == '__main__': - print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) - print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) - ScraperImplementation().doScrape() - --- /dev/null +++ b/documents/scrapers/f189459fc43f941e0d4ecfba52c666f3.py @@ -1,1 +1,19 @@ +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers + +class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper): + + def __init__(self): + 
super(ScraperImplementation, self).__init__() + + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, + genericScrapers.GenericHTMLDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), + genericScrapers.GenericHTMLDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/f2ab2908d8ee56ed8d995ef4187e75e6.py +++ b/documents/scrapers/f2ab2908d8ee56ed8d995ef4187e75e6.py @@ -8,7 +8,7 @@ class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return (id, date, title, description, notes) def getTable(self,soup): return soup.find(id = "content").table --- /dev/null +++ b/documents/scrapers/f5ce2d1651739704634eb8ca4b2b46d3.py @@ -1,1 +1,21 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +#http://www.doughellmann.com/PyMOTW/abc/ +class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getTable(self,soup): + return soup.find(id = "ctl00_PlaceHolderMain_PublishingPageContent__ControlWrapper_RichHtmlField").table + def getColumnCount(self): + return 7 + def getColumns(self,columns): + (id, date, title, description,link,deldate, notes) = columns + return (id, date, title, description, notes) + +if __name__ == '__main__': + print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + ScraperImplementation().doScrape() + --- a/documents/scrapers/rtk.py +++ b/documents/scrapers/rtk.py @@ -7,7 +7,7 @@ class ScraperImplementation(genericScrapers.GenericRSSDisclogScraper): def getColumns(self,columns): (id, date, title, description, notes) = columns - return (id, date, description, title, notes) + return 
(id, date, title, description, notes) if __name__ == '__main__': print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericRSSDisclogScraper) --- /dev/null +++ b/documents/sitemap.xml.php @@ -1,1 +1,33 @@ +"; +echo '' . "\n"; +echo " " . local_url() . "index.php1.0\n"; +foreach (scandir("./") as $file) { + if (strpos($file, ".php") !== false && $file != "index.php" && $file != "sitemap.xml.php") + echo " " . local_url() . "$file0.6\n"; +} +$agenciesdb = $server->get_db('disclosr-agencies'); +try { + $rows = $agenciesdb->get_view("app", "byCanonicalName")->rows; + foreach ($rows as $row) { + echo '' . local_url() . 'agency.php?id=' . $row->value->_id . "0.3\n"; + } +} catch (SetteeRestClientException $e) { + setteErrorHandler($e); +} +$foidocsdb = $server->get_db('disclosr-foidocuments'); +try { + $rows = $foidocsdb->get_view("app", "all")->rows; + foreach ($rows as $row) { + echo '' . local_url() . 'view.php?id=' . $row->value->_id . "0.3\n"; + } +} catch (SetteeRestClientException $e) { + setteErrorHandler($e); +} +echo ''; +?> + --- a/documents/template.inc.php +++ b/documents/template.inc.php @@ -1,150 +1,183 @@ - - - - - - - - - + header('X-UA-Compatible: IE=edge,chrome=1'); + ?> + + + + + + + + + - - + Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?> + - Australian Disclosure Logs<?php if ($title != "") echo " - $title";?> - + + + + + - - + + + + - - + + + - - - - + + + - - - + + + +
+ -

© Company 2012

- - - - - + (function() { + var ga = document.createElement('script'); + ga.type = 'text/javascript'; + ga.async = true; + ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; + var s = document.getElementsByTagName('script')[0]; + s.parentNode.insertBefore(ga, s); + })(); + + "; + } + ?> + + + - - - + + $length) { + //limit hit! + $string = substr($string, 0, ($length - 3)); + if ($stopanywhere) { + //stop anywhere + $string .= '...'; + } else { + //stop on a word. + $string = substr($string, 0, strrpos($string, ' ')) . '...'; + } + } + return $string; } function displayLogEntry($row, $idtoname) { - echo "

".$row->value->date.": ".$row->value->title." (".$idtoname[$row->value->agencyID].")

".str_replace("\n","
",$row->value->description); -if (isset($row->value->notes)) { -echo "
Note: ".$row->value->notes; -} -echo "

"; + $result = ""; + $result .= '
'; + $result .= '

: " . truncate($row->value->title, 120) . ""; + $result .= ' (' . $idtoname[$row->value->agencyID] . ')

'; + $result .= "

Title: " . $row->value->title . "
"; + if (isset($row->value->description)) { + $result .= str_replace("\n", "
", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "",trim($row->value->description))); + } + if (isset($row->value->notes)) { + $result .= "
Note: " . $row->value->notes; + } + $result .= "

"; -if (isset($row->value->links)){ -echo "

Links/Documents

    "; -foreach ($row->value->links as $link) { - echo "
  • ".$link."
  • "; + if (isset($row->value->links)) { + $result .= '

    Links/Documents

    "; + } + $result .= " ID: " . strip_tags($row->value->docID) . ""; + $result .= "
\n"; + return $result; } - echo ""; -} - echo "View original source... ID: ".$row->value->docID.""; -echo"
"; -} - --- /dev/null +++ b/documents/view.php @@ -1,1 +1,28 @@ + +get_db('disclosr-agencies'); + +$idtoname = Array(); +foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) { + $idtoname[$row->id] = trim($row->value->name); +} +$foidocsdb = $server->get_db('disclosr-foidocuments'); +try { + $obj = new stdClass(); + $obj->value = $foidocsdb->get($_REQUEST['id']); + include_header_documents($obj->value->title); + +echo displayLogEntry($obj,$idtoname); + +} catch (SetteeRestClientException $e) { + setteErrorHandler($e); +} +include_footer_documents(); +?> + --- a/documents/viewDocument.php +++ b/documents/viewDocument.php @@ -3,7 +3,13 @@ include_once('../include/common.inc.php'); $hash = $_REQUEST['hash']; $docsdb = $server->get_db('disclosr-documents'); +try { $doc = object_to_array($docsdb->get($hash)); + +} catch (SetteeRestClientException $e) { + setteErrorHandler($e); +} + if (!isset($doc['_attachments']) || count($doc['_attachments']) == 0) die ("no attachments"); $attachments = $doc['_attachments']; @@ -13,3 +19,4 @@ //echo $url; $request = Requests::get($url); echo ($request->body); + --- a/getAgency.php +++ b/getAgency.php @@ -5,11 +5,11 @@ function displayValue($key, $value, $mode) { global $db, $schemas; if ($mode == "view") { - if (strpos($key, "_") === 0 || $key == "metadata") + if (strpos($key, "_") === 0 || $key == "metadata" || $key == "metaTags" || $key == "statistics") return; echo ""; - echo ""; + echo ""; if (isset($schemas['agency']["properties"][$key])) { echo $schemas['agency']["properties"][$key]['x-title'] . "
" . $schemas['agency']["properties"][$key]['description'] . ""; } @@ -202,8 +202,10 @@ try { $rows = $db->get_view("app", "byCanonicalName")->rows; //print_r($rows); + $rowCount = count($rows); echo '