From: Maxious Date: Sat, 01 Dec 2012 07:50:43 +0000 Subject: rm settee X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=5efdc83287d1abe104a141b533c11000bf51bbb5 --- rm settee Former-commit-id: 2fe3d86753e524fca7ee4e095a794727c0556d79 --- --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,6 @@ [submodule "couchdb/couchdb-lucene"] path = couchdb/couchdb-lucene url = https://github.com/rnewson/couchdb-lucene.git -[submodule "couchdb/settee"] - path = couchdb/settee - url = https://github.com/inadarei/settee.git [submodule "lib/php-diff"] path = lib/php-diff url = https://github.com/chrisboulton/php-diff.git --- a/admin/refreshDesignDoc.php +++ b/admin/refreshDesignDoc.php @@ -4,74 +4,62 @@ //function createFOIDocumentsDesignDoc() { $foidb = $server->get_db('disclosr-foidocuments'); - $obj = new stdClass(); - $obj->_id = "_design/" . urlencode("app"); - $obj->language = "javascript"; - $obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; - $obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };"; - $obj->views->byDate->reduce = "_count"; - $obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };"; - $obj->views->byAgencyID->reduce = "_count"; +$obj = new stdClass(); +$obj->_id = "_design/" . urlencode("app"); +$obj->language = "javascript"; +$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; +$obj->views->byDate->map = "function(doc) { emit(doc.date, doc); };"; +$obj->views->byDate->reduce = "_count"; +$obj->views->byDateMonthYear->map = "function(doc) { emit(doc.date, doc); };"; +$obj->views->byDateMonthYear->reduce = "_count"; +$obj->views->byAgencyID->map = "function(doc) { emit(doc.agencyID, doc); };"; +$obj->views->byAgencyID->reduce = "_count"; - // allow safe updates (even if slightly slower due to extra: rev-detection check). - $foidb->save($obj, true); +// allow safe updates (even if slightly slower due to extra: rev-detection check). +$foidb->save($obj, true); -function createDocumentsDesignDoc() { - /* - global $db; - $obj = new stdClass(); - $obj->_id = "_design/" . urlencode("app"); - $obj->language = "javascript"; - $obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; - $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; - "views": { - "web_server": { - "map": "function(doc) {\n emit(doc.web_server, 1);\n}", - "reduce": "function (key, values, rereduce) {\n return sum(values);\n}" - }, - "byAgency": { - "map": "function(doc) {\n emit(doc.agencyID, 1);\n}", - "reduce": "function (key, values, rereduce) {\n return sum(values);\n}" - }, - "byURL": { - "map": "function(doc) {\n emit(doc.url, doc);\n}" - }, - "agency": { - "map": "function(doc) {\n emit(doc.agencyID, doc);\n}" - }, - "byWebServer": { - "map": "function(doc) {\n emit(doc.web_server, doc);\n}" - }, - "getValidationRequired": { - "map": "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}" - } - } */ -} +//function createDocumentsDesignDoc() { +$docdb = $server->get_db('disclosr-documents'); + +$obj = new stdClass(); +$obj->_id = "_design/" . urlencode("app"); +$obj->language = "javascript"; +$obj->views->web_server->map = "function(doc) {\n emit(doc.web_server, 1);\n}"; +$obj->views->web_server->reduce = "function (key, values, rereduce) {\n return sum(values);\n}"; +$obj->views->byAgency->map = "function(doc) {\n emit(doc.agencyID, 1);\n}"; +$obj->views->byAgency->reduce = "function (key, values, rereduce) {\n return sum(values);\n}"; +$obj->views->byURL->map = "function(doc) {\n emit(doc.url, doc);\n}"; +$obj->views->agency->map = "function(doc) {\n emit(doc.agencyID, doc);\n}"; +$obj->views->byWebServer->map = "function(doc) {\n emit(doc.web_server, doc);\n}"; +$obj->views->getValidationRequired = "function(doc) {\nif (doc.mime_type == \"text/html\" \n&& typeof(doc.validation) == \"undefined\") {\n emit(doc._id, doc._attachments);\n}\n}"; + + + //function createAgencyDesignDoc() { $db = $server->get_db('disclosr-agencies'); - $obj = new stdClass(); - $obj->_id = "_design/" . urlencode("app"); - $obj->language = "javascript"; - $obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; - $obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; - $obj->views->byCanonicalName->map = "function(doc) { +$obj = new stdClass(); +$obj->_id = "_design/" . urlencode("app"); +$obj->language = "javascript"; +$obj->views->all->map = "function(doc) { emit(doc._id, doc); };"; +$obj->views->byABN->map = "function(doc) { emit(doc.abn, doc); };"; +$obj->views->byCanonicalName->map = "function(doc) { if (doc.parentOrg || doc.orgType == 'FMA-DepartmentOfState') { emit(doc.name, doc); } };"; - $obj->views->byDeptStateName->map = "function(doc) { +$obj->views->byDeptStateName->map = "function(doc) { if (doc.orgType == 'FMA-DepartmentOfState') { emit(doc.name, doc._id); } };"; - $obj->views->parentOrgs->map = "function(doc) { +$obj->views->parentOrgs->map = "function(doc) { if (doc.parentOrg) { emit(doc._id, doc.parentOrg); } };"; - $obj->views->byName->map = 'function(doc) { +$obj->views->byName->map = 'function(doc) { if (typeof(doc["status"]) == "undefined" || doc["status"] != "suspended") { emit(doc.name, doc._id); if (typeof(doc.shortName) != "undefined" && doc.shortName != doc.name) { @@ -95,14 +83,14 @@ } };'; - $obj->views->foiEmails->map = "function(doc) { +$obj->views->foiEmails->map = "function(doc) { emit(doc._id, doc.foiEmail); };"; - $obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; - $obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; - $obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; - $obj->views->getScrapeRequired->map = "function(doc) { +$obj->views->byLastModified->map = "function(doc) { emit(doc.metadata.lastModified, doc); }"; +$obj->views->getActive->map = 'function(doc) { if (doc.status == "active") { emit(doc._id, doc); } };'; +$obj->views->getSuspended->map = 'function(doc) { if (doc.status == "suspended") { emit(doc._id, doc); } };'; +$obj->views->getScrapeRequired->map = "function(doc) { var lastScrape = Date.parse(doc.metadata.lastScraped); @@ -113,14 +101,14 @@ } };"; - $obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };"; - $obj->views->getConflicts->map = "function(doc) { +$obj->views->showNamesABNs->map = "function(doc) { emit(doc._id, {name: doc.name, abn: doc.abn}); };"; +$obj->views->getConflicts->map = "function(doc) { if (doc._conflicts) { emit(null, [doc._rev].concat(doc._conflicts)); } }"; - // http://stackoverflow.com/questions/646628/javascript-startswith - $obj->views->score->map = 'if(!String.prototype.startsWith){ +// http://stackoverflow.com/questions/646628/javascript-startswith +$obj->views->score->map = 'if(!String.prototype.startsWith){ String.prototype.startsWith = function (str) { return !this.indexOf(str); } @@ -144,7 +132,7 @@ emit(count+doc._id, {id:doc._id, name: doc.name, score:count, orgType: doc.orgType, portfolio:portfolio}); } }'; - $obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ +$obj->views->scoreHas->map = 'if(!String.prototype.startsWith){ String.prototype.startsWith = function (str) { return !this.indexOf(str); } @@ -164,22 +152,20 @@ emit("total", 1); } }'; - $obj->views->scoreHas->reduce = 'function (key, values, rereduce) { +$obj->views->scoreHas->reduce = 'function (key, values, rereduce) { return sum(values); }'; - $obj->views->fieldNames->map = ' +$obj->views->fieldNames->map = ' function(doc) { for(var propName in doc) { emit(propName, doc._id); } }'; - $obj->views->fieldNames->reduce = 'function (key, values, rereduce) { +$obj->views->fieldNames->reduce = 'function (key, values, rereduce) { return values.length; }'; - // allow safe updates (even if slightly slower due to extra: rev-detection check). - $db->save($obj, true); - - +// allow safe updates (even if slightly slower due to extra: rev-detection check). +$db->save($obj, true); ?> --- a/couchdb/SetteeDatabase.class.php +++ /dev/null @@ -1,306 +1,1 @@ -conn_url = $conn_url; - $this->dbname = $dbname; - $this->rest_client = SetteeRestClient::get_instance($this->conn_url); - } - - - /** - * Get UUID from CouchDB - * - * @return - * CouchDB-generated UUID string - * - */ - function gen_uuid() { - $ret = $this->rest_client->http_get('_uuids'); - return $ret['decoded']->uuids[0]; // should never be empty at this point, so no checking - } - - /** - * Create or update a document database - * - * @param $document - * PHP object, a PHP associative array, or a JSON String representing the document to be saved. PHP Objects and arrays are JSON-encoded automatically. - * - *

If $document has a an "_id" property set, it will be used as document's unique id (even for "create" operation). - * If "_id" is missing, CouchDB will be used to generate a UUID. - * - *

If $document has a "_rev" property (revision), document will be updated, rather than creating a new document. - * You have to provide "_rev" if you want to update an existing document, otherwise operation will be assumed to be - * one of creation and you will get a duplicate document exception from CouchDB. Also, you may not provide "_rev" but - * not provide "_id" since that is an invalid input. - * - * @param $allowRevAutoDetection - * Default: false. When true and _rev is missing from the document, save() function will auto-detect latest revision - * for a document and use it. This option is "false" by default because it involves an extra http HEAD request and - * therefore can make save() operation slightly slower if such auto-detection is not required. - * - * @return - * document object with the database id (uuid) and revision attached; - * - * @throws SetteeCreateDatabaseException - */ - function save($document, $allowRevAutoDetection = false) { - if (is_string($document)) { - $document = json_decode($document); - } - - // Allow passing of $document as an array (for syntactic simplicity and also because in JSON world it does not matter) - if(is_array($document)) { - $document = (object) $document; - } - - if (empty($document->_id) && empty($document->_rev)) { - $id = $this->gen_uuid(); - } - elseif (empty($document->_id) && !empty($document->_rev)) { - throw new SetteeWrongInputException("Error: You can not save a document with a revision provided, but missing id"); - } - else { - $id = $document->_id; - - if ($allowRevAutoDetection) { - try { - $rev = $this->get_rev($id); - } catch (SetteeRestClientException $e) { - // auto-detection may fail legitimately, if a document has never been saved before (new doc), so skipping error - } - if (!empty($rev)) { - $document->_rev = $rev; - } - } - } - - $full_uri = $this->dbname . "/" . $this->safe_urlencode($id); - $document_json = json_encode($document, JSON_NUMERIC_CHECK); - - $ret = $this->rest_client->http_put($full_uri, $document_json); - - $document->_id = $ret['decoded']->id; - $document->_rev = $ret['decoded']->rev; - - return $document; - } - - /** - * @param $doc - * @param $name - * @param $content - * Content of the attachment in a string-buffer format. This function will automatically base64-encode content for - * you, so you don't have to do it. - * @param $mime_type - * Optional. Will be auto-detected if not provided - * @return void - */ - public function add_attachment($doc, $name, $content, $mime_type = null) { - if (empty($doc->_attachments) || !is_object($doc->_attachments)) { - $doc->_attachments = new stdClass(); - } - - if (empty($mime_type)) { - $mime_type = $this->rest_client->content_mime_type($content); - } - - $doc->_attachments->$name = new stdClass(); - $doc->_attachments->$name->content_type = $mime_type; - $doc->_attachments->$name->data = base64_encode($content); - } - - /** - * @param $doc - * @param $name - * @param $file - * Full path to a file (e.g. as returned by PHP's realpath function). - * @param $mime_type - * Optional. Will be auto-detected if not provided - * @return void - */ - public function add_attachment_file($doc, $name, $file, $mime_type = null) { - $content = file_get_contents($file); - $this->add_attachment($doc, $name, $content, $mime_type); - } - - /** - * - * Retrieve a document from CouchDB - * - * @throws SetteeWrongInputException - * - * @param $id - * Unique ID (usually: UUID) of the document to be retrieved. - * @return - * database document in PHP object format. - */ - function get($id) { - if (empty($id)) { - throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid."); - } - - $full_uri = $this->dbname . "/" . $this->safe_urlencode($id); -$full_uri = str_replace("%3Frev%3D","?rev=",$full_uri); - $ret = $this->rest_client->http_get($full_uri); - return $ret['decoded']; - } - - /** - * - * Get the latest revision of a document with document id: $id in CouchDB. - * - * @throws SetteeWrongInputException - * - * @param $id - * Unique ID (usually: UUID) of the document to be retrieved. - * @return - * database document in PHP object format. - */ - function get_rev($id) { - if (empty($id)) { - throw new SetteeWrongInputException("Error: Can't query a document without a uuid."); - } - - $full_uri = $this->dbname . "/" . $this->safe_urlencode($id); - $headers = $this->rest_client->http_head($full_uri); - if (empty($headers['Etag'])) { - throw new SetteeRestClientException("Error: could not retrieve revision. Server unexpectedly returned empty Etag"); - } - $etag = str_replace('"', '', $headers['Etag']); - return $etag; - } - - /** - * Delete a document - * - * @param $document - * a PHP object or JSON representation of the document that has _id and _rev fields. - * - * @return void - */ - function delete($document) { - if (!is_object($document)) { - $document = json_decode($document); - } - - $full_uri = $this->dbname . "/" . $this->safe_urlencode($document->_id) . "?rev=" . $document->_rev; - $this->rest_client->http_delete($full_uri); - } - - - /*----------------- View-related functions --------------*/ - - /** - * Create a new view or update an existing one. - * - * @param $design_doc - * @param $view_name - * @param $map_src - * Source code of the map function in Javascript - * @param $reduce_src - * Source code of the reduce function in Javascript (optional) - * @return void - */ - function save_view($design_doc, $view_name, $map_src, $reduce_src = null) { - $obj = new stdClass(); - $obj->_id = "_design/" . urlencode($design_doc); - $view_name = urlencode($view_name); - $obj->views->$view_name->map = $map_src; - if (!empty($reduce_src)) { - $obj->views->$view_name->reduce = $reduce_src; - } - - // allow safe updates (even if slightly slower due to extra: rev-detection check). - return $this->save($obj, true); - } - - /** - * Create a new view or update an existing one. - * - * @param $design_doc - * @param $view_name - * @param $key - * key parameter to a view. Can be a single value or an array (for a range). If passed an array, function assumes - * that first element is startkey, second: endkey. - * @param $descending - * return results in descending order. Please don't forget that if you are using a startkey/endkey, when you change - * order you also need to swap startkey and endkey values! - * - * @return void - */ - function get_view($design_doc, $view_name, $key = null, $descending = false) { - $id = "_design/" . urlencode($design_doc); - $view_name = urlencode($view_name); - $id .= "/_view/$view_name"; - - $data = array(); - if (!empty($key)) { - if (is_string($key)) { - $data = "key=" . '"' . $key . '"'; - } - elseif (is_array($key)) { - list($startkey, $endkey) = $key; - $data = "startkey=" . '"' . $startkey . '"&' . "endkey=" . '"' . $endkey . '"'; - } - - if ($descending) { - $data .= "&descending=true"; - } - } - - - - if (empty($id)) { - throw new SetteeWrongInputException("Error: Can't retrieve a document without a uuid."); - } - - $full_uri = $this->dbname . "/" . $this->safe_urlencode($id); -$full_uri = str_replace("%253Fgroup%253Dtrue","?group=true",$full_uri); - $ret = $this->rest_client->http_get($full_uri, $data); - return $ret['decoded']; - - } - - /** - * @param $id - * @return - * return a properly url-encoded id. - */ - private function safe_urlencode($id) { - //-- System views like _design can have "/" in their URLs. - $id = rawurlencode($id); - if (substr($id, 0, 1) == '_') { - $id = str_replace('%2F', '/', $id); - } - return $id; - } - - /** Getter for a database name */ - function get_name() { - return $this->dbname; - } - -} --- a/couchdb/settee +++ /dev/null --- a/documents/disclogsList.php +++ b/documents/disclogsList.php @@ -12,6 +12,7 @@ $disclogs = 0; $red = 0; $green = 0; +$yellow = 0; $orange = 0; try { $rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows; @@ -46,8 +47,14 @@ echo ""; $green++; } else if (file_exists("./scrapers/" . $row->id . '.txt')) { + if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") { + echo ""; + $yellow++; + } else { + echo file_get_contents("./scrapers/" . $row->id . '.txt'); echo ""; $orange++; + } } else { echo ""; $red++; --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -9,6 +9,7 @@ import dateutil from dateutil.parser import * from datetime import * +import codecs class GenericDisclogScraper(object): __metaclass__ = abc.ABCMeta @@ -93,7 +94,7 @@ def getDate(self, content, entry, doc): date = ''.join(content.stripped_strings).strip() (a,b,c) = date.partition("(") - date = a.replace("Octber","October") + date = self.remove_control_chars(a.replace("Octber","October")) print date edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d") print edate @@ -120,7 +121,7 @@ columns = row.find_all('td') if len(columns) == self.getColumnCount(): (id, date, title, description, notes) = self.getColumns(columns) - print ''.join(id.stripped_strings) + print self.remove_control_chars(''.join(id.stripped_strings)) if id.string == None: hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings)))) else: --- a/documents/index.php +++ b/documents/index.php @@ -1,11 +1,11 @@ get_db('disclosr-agencies'); @@ -15,17 +15,17 @@ } $foidocsdb = $server->get_db('disclosr-foidocuments'); try { - $rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99','0000-00-00'), true)->rows; - - + $rows = $foidocsdb->get_view("app", "byDate", Array($startkey, '0000-00-00'), true, 20)->rows; if ($rows) { - foreach ($rows as $row) { -echo displayLogEntry($row,$idtoname); + foreach ($rows as $key => $row) { + echo displayLogEntry($row, $idtoname); + $endkey = $row->key; } } } catch (SetteeRestClientException $e) { setteErrorHandler($e); } +echo "next page"; include_footer_documents(); ?> --- a/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt +++ b/documents/scrapers/627f116dfe42c9f27ad6747be0aa44e2.txt @@ -1,2 +1,1 @@ -see parent dhs - +no disclog --- a/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt +++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt @@ -1,1 +1,1 @@ - +acma style --- /dev/null +++ b/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.py @@ -1,1 +1,86 @@ +import sys,os +sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) +import genericScrapers +import scrape +from bs4 import BeautifulSoup +import codecs +#http://www.doughellmann.com/PyMOTW/abc/ +class NewScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getDescription(self,content, entry,doc): + link = None + links = [] + description = "" + for atag in entry.find_all('a'): + if atag.has_key('href'): + link = scrape.fullurl(self.getURL(),atag['href']) + (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) + if htcontent != None: + if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": + # http://www.crummy.com/software/BeautifulSoup/documentation.html + soup = BeautifulSoup(htcontent) + for text in soup.find(id="divFullWidthColumn").stripped_strings: + description = description + text.encode('ascii', 'ignore') + + for atag in soup.find(id="divFullWidthColumn").find_all("a"): + if atag.has_key('href'): + links.append(scrape.fullurl(link,atag['href'])) + if links != []: + doc.update({'links': links}) + if description != "": + doc.update({ 'description': description}) + + def getColumnCount(self): + return 2 + def getTable(self,soup): + return soup.find(id = "TwoColumnSorting") + def getColumns(self,columns): + ( title, date) = columns + return (title, date, title, title, None) +class OldScraperImplementation(genericScrapers.GenericOAICDisclogScraper): + def getDescription(self,content, entry,doc): + link = None + links = [] + description = "" + for atag in entry.find_all('a'): + if atag.has_key('href'): + link = scrape.fullurl(self.getURL(),atag['href']) + (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) + if htcontent != None: + if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": + # http://www.crummy.com/software/BeautifulSoup/documentation.html + soup = BeautifulSoup(htcontent) + for text in soup.find(id="content-item").stripped_strings: + description = description + text + " \n" + for atag in soup.find(id="content-item").find_all("a"): + if atag.has_key('href'): + links.append(scrape.fullurl(link,atag['href'])) + if links != []: + doc.update({'links': links}) + if description != "": + doc.update({ 'description': description}) + + if links != []: + doc.update({'links': links}) + if description != "": + doc.update({ 'description': description}) + + def getColumnCount(self): + return 2 + def getTable(self,soup): + return soup.find(class_ = "doc-list") + def getColumns(self,columns): + (date, title) = columns + return (title, date, title, title, None) + +if __name__ == '__main__': + print 'Subclass:', issubclass(NewScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(NewScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + #NewScraperImplementation().doScrape() + print 'Subclass:', issubclass(OldScraperImplementation, genericScrapers.GenericOAICDisclogScraper) + print 'Instance:', isinstance(OldScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) + osi = OldScraperImplementation() + osi.disclogURL = "http://archive.treasury.gov.au/content/foi_publications.asp?year=-1&abstract=0&classification=&=&titl=Disclosure+Log+-+Documents+Released+Under+FOI" + osi.doScrape() +# old site too + --- a/documents/scrapers/8e874a2fde8aa0ccdc6d14573d766540.txt +++ /dev/null @@ -1,49 +1,1 @@ -import sys,os -sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) -import genericScrapers -import scrape -from bs4 import BeautifulSoup -#http://www.doughellmann.com/PyMOTW/abc/ -class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper): - def getDescription(self,content, entry,doc): - link = None - links = [] - description = "" - for atag in entry.find_all('a'): - if atag.has_key('href'): - link = scrape.fullurl(self.getURL(),atag['href']) - (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False) - if htcontent != None: - if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": - # http://www.crummy.com/software/BeautifulSoup/documentation.html - soup = BeautifulSoup(htcontent) - for row in soup.find(class_ = "ms-rteTable-GreyAlternating").find_all('tr'): - if row != None: - rowtitle = row.find('th').string - description = description + "\n" + rowtitle + ": " - for text in row.find('td').stripped_strings: - description = description + text - for atag in row.find_all("a"): - if atag.has_key('href'): - links.append(scrape.fullurl(link,atag['href'])) - - if links != []: - doc.update({'links': links}) - if description != "": - doc.update({ 'description': description}) - - def getColumnCount(self): - return 2 - def getTable(self,soup): - return soup.find(class_ = "ms-rteTable-GreyAlternating") - def getColumns(self,columns): - (date, title) = columns - return (title, date, title, title, None) - -if __name__ == '__main__': - print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) - print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) - ScraperImplementation().doScrape() -# old site too http://archive.treasury.gov.au/content/foi_publications.asp - --- a/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt +++ b/documents/scrapers/bb96fe4065afb7e0872136dd657f9369.txt @@ -1,2 +1,1 @@ -# does not have any disclog entries or table - +no disclog --- a/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt +++ b/documents/scrapers/bf6e587f166040b63681cd2ff76fbfdf.txt @@ -1,1 +1,1 @@ -no disclog yet +no disclog --- a/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt +++ b/documents/scrapers/cb7f40e3495b682de6eee61bf09c1cfc.txt @@ -1,2 +1,1 @@ -no log - +no disclog --- a/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt +++ b/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt @@ -1,1 +1,1 @@ - +acma style --- a/documents/scrapers/e770921522a49dc77de208cc724ce134.txt +++ b/documents/scrapers/e770921522a49dc77de208cc724ce134.txt @@ -1,2 +1,1 @@ -c'est ne pas une table - +no disclog --- a/documents/template.inc.php +++ b/documents/template.inc.php @@ -145,7 +145,7 @@ $result .= ""; } - $result .= "View original source... ID: ".$row->value->docID.""; + $result .= "View original source... ID: ".strip_tags($row->value->docID).""; $result .= ""; return $result; }