From: Maxious Date: Tue, 18 Dec 2012 22:20:12 +0000 Subject: columns X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=7c78bd9f790ed5a9a2a3483346a9cb6449eddba1 --- columns Former-commit-id: 82edd5f41bab243828a5febd9e00b5fdb051dc86 --- --- a/admin/genericAgencyFixer.php +++ b/admin/genericAgencyFixer.php @@ -7,28 +7,48 @@ $db = $server->get_db('disclosr-agencies'); +// metatags +try { + $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows; + //print_r($rows); + foreach ($agencies as $agency) { + if (isset($agency->value->scrapeDepth)) { + unset($agency->value->scrapeDepth); + } + if (isset($agency->value->lastScraped)) { + unset($agency->value->lastScraped); + } + $db->save($agency->value); + echo "
"; + flush(); + } +} catch (SetteeRestClientException $e) { + setteErrorHandler($e); +} +// metatags try { $agencies = $db->get_view("app", "byCanonicalName", null, true)->rows; //print_r($rows); foreach ($agencies as $agency) { //echo $agency->value->name . " ".$agency->value->website."
\n"; - // print_r($agency); + // print_r($agency); //hasRestricitiveLicence" hasRestrictiveLicense -> has Restrictive Licence // "hasYoutube" -> Tube // "comment" -> "comments" if (!isset($agency->value->metaTags) && isset($agency->value->website)) { - echo $agency->value->name . " ".$agency->value->website."
\n"; + echo $agency->value->name . " " . $agency->value->website . "
\n"; $agency->value->metaTags = Array(); $request = Requests::get($agency->value->website); $html = phpQuery::newDocumentHTML($request->body); phpQuery::selectDocument($html); foreach (pq('meta')->elements as $meta) { - $tagName = $meta->getAttribute('name');; + $tagName = $meta->getAttribute('name'); + ; $content = $meta->getAttribute('content'); if ($tagName != "") { -echo "$tagName == $content
\n"; - $agency->value->metaTags[$tagName] = $content; + echo "$tagName == $content
\n"; + $agency->value->metaTags[$tagName] = $content; } } //print_r($agency->value->metaTags); --- a/admin/importRTKbodies.php +++ b/admin/importRTKbodies.php @@ -29,6 +29,7 @@ } else { echo $Row[array_search($nameField, $headers)] . PHP_EOL; $accounts[$nametoid[trim($agencyName)]]["rtkURLs"][$agencyName] = 'http://www.righttoknow.org.au/body/'.$Row[array_search($accountField, $headers)]; + $accounts[$nametoid[trim($agencyName)]]["rtkDescriptions"][$agencyName] = $Row[array_search("Notes", $headers)]; } } else { echo "error finding any agency" . $line . PHP_EOL; @@ -38,19 +39,26 @@ } extractCSVAccounts("http://www.righttoknow.org.au/body/all-authorities.csv","Agency","URL name"); -print_r($accounts); -/* foreach ($accounts as $id => $accountTypes) { +//print_r($accounts); + foreach ($accounts as $id => $allvalues) { echo $id . "
" . PHP_EOL; $doc = object_to_array($db->get($id)); // print_r($doc); - foreach ($accountTypes as $accountType => $accounts) { - if (!isset($doc["has" . $accountType]) || !is_array($doc["has" . $accountType])) { - $doc["has" . $accountType] = Array(); + foreach ($allvalues as $valueType => $values) { + if (!isset($doc[ $valueType]) || !is_array($doc[ $valueType])) { + $doc[ $valueType] = Array(); } - $doc["has" . $accountType] = array_unique(array_merge($doc["has" . $accountType], $accounts)); + $doc[ $valueType] = array_unique(array_merge($doc[ $valueType], $values)); + if ( $valueType == "rtkDescriptions") { + foreach ($values as $descriptionAgency => $descriptionValue) { + if ($descriptionAgency == $doc->value->name) { + $doc->value->description = $descriptionValue; + } + } + } } $db->save($doc); -}*/ +} ?> --- a/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py +++ b/documents/scrapers/0049d35216493c545ef5f7f000e6b252.py @@ -2,7 +2,14 @@ import os sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) import genericScrapers - +import traceback +try: + import amonpy + amonpy.config.address = 'http://amon_instance:port' + amonpy.config.secret_key = 'the secret key from /etc/amon.conf' + amon_available = True +except ImportError: + amon_available = False class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper): @@ -15,5 +22,27 @@ genericScrapers.GenericPDFDisclogScraper) print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericPDFDisclogScraper) - ScraperImplementation().doScrape() + try: + ScraperImplementation().doScrape() + except Exception, err: + sys.stderr.write('ERROR: %s\n' % str(err)) + print ‘Error Reason: ‘, err.__doc__ + print ‘Exception: ‘, err.__class__ + print traceback.format_exc() + if amon_available: + data = { + 'exception_class': '', + 'url': '', + 'backtrace': ['exception line ', 'another exception line'], + 'enviroment': '', + + # In 'data' you can add request information, session variables - it's a recursive + # dictionary, so you can literally add everything important for your specific case + # The dictionary doesn't have a specified structure, the keys below are only example + 'data': {'request': '', 'session': '', 'more': ''} + } + + amonpy.exception(data) + pass + --- a/getAgency.php +++ b/getAgency.php @@ -5,11 +5,11 @@ function displayValue($key, $value, $mode) { global $db, $schemas; if ($mode == "view") { - if (strpos($key, "_") === 0 || $key == "metadata") + if (strpos($key, "_") === 0 || $key == "metadata" || $key == "metaTags" || $key == "statistics") return; echo ""; - echo ""; + echo ""; if (isset($schemas['agency']["properties"][$key])) { echo $schemas['agency']["properties"][$key]['x-title'] . "
" . $schemas['agency']["properties"][$key]['description'] . ""; } @@ -202,8 +202,10 @@ try { $rows = $db->get_view("app", "byCanonicalName")->rows; //print_r($rows); + $rowCount = count($rows); echo '