From: Maxious Date: Thu, 22 Nov 2012 05:47:09 +0000 Subject: ausaid scraper X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=b33710797e56bcf3bd111f01e00a487ab199cecd --- ausaid scraper Former-commit-id: d2ca4331376808c570b588bad9a3c97df495d510 --- --- a/documents/disclogsList.php +++ b/documents/disclogsList.php @@ -1,4 +1,5 @@ " . $row->value->name . " (".$row->id.")\n"; - - echo ""; - if (isset($row->value->FOIDocumentsURL)) { - echo '' - .$row->value->FOIDocumentsURL.''; - } else { + echo "" . $row->value->name . ""; + if ($ENV == "DEV") + echo "
(" . $row->id . ")"; + echo "\n"; + + + echo ""; + if (isset($row->value->FOIDocumentsURL)) { + echo '' + . $row->value->FOIDocumentsURL . ''; + if ($ENV == "DEV") + echo '
(' + . 'view local copy)'; + } else { echo ""; } echo "\n"; if (isset($row->value->FOIDocumentsURL)) { - if (file_exists("./scrapers/".$row->id.'.py')) { - echo ""; - } else if (file_exists("./scrapers/".$row->id.'.txt')){ - echo ""; - } else { - echo ""; - } + if (file_exists("./scrapers/" . $row->id . '.py')) { + echo ""; + } else if (file_exists("./scrapers/" . $row->id . '.txt')) { + echo ""; + } else { + echo ""; + } } echo "\n"; } --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -82,8 +82,21 @@ descriptiontxt = descriptiontxt + " \n" + string doc.update({'description': descriptiontxt}) return + def getTitle(self, content, entry, doc): + doc.update({'title': content.string}) + return def getTable(self, soup): return soup.table + def getDate(self, content, entry, doc): + dtresult = cal.parseDateText(content.string) + if len(dtresult) == 2: + (dtdate,dtr) = dtresult + edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2]) + else: + edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d") + print edate + doc.update({'date': edate}) + return def doScrape(self): cal = pdt.Calendar() @@ -111,19 +124,13 @@ if doc == None: print "saving" - dtresult = cal.parseDateText(date.string) - if len(dtresult) == 2: - (dtdate,dtr) = dtresult - print dtdate - edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2]) - else: - edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d") - doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string, - "date": edate,"title": title.string} + doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string} if links != []: doc.update({'links': links}) + self.getTitle(title,row, doc) + self.getDate(date,row, doc) self.getDescription(description,row, doc) - if notes != None: + if notes != None: doc.update({ 'notes': notes.string}) foidocsdb.save(doc) else: --- a/getAgency.php +++ b/getAgency.php @@ -119,7 +119,7 @@ // edit? $obj = $db->get($_REQUEST['id']); - include_header($obj->name); + include_header(isset($obj->name) ? $obj->name : ""); //print_r($row); if (sizeof($_POST) > 0) { //print_r($_POST); @@ -200,14 +200,6 @@ // show all list include_header('Agencies'); try { - /* $rows = $db->get_view("app", "showNamesABNs")->rows; - //print_r($rows); - foreach ($rows as $row) { - // print_r($row); - echo '
  • ' . - (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn) - . '
  • '; - } */ $rows = $db->get_view("app", "byCanonicalName")->rows; //print_r($rows); echo '"; --- a/include/common.inc.php +++ b/include/common.inc.php @@ -17,7 +17,7 @@ require_once $basePath.'lib/Requests/library/Requests.php'; Requests::register_autoloader(); - +$ENV = "DEV"; if (isset($_SERVER['SERVER_NAME']) && $_SERVER['SERVER_NAME'] != 'localhost') { require $basePath."lib/amon-php/amon.php"; @@ -25,6 +25,7 @@ 'protocol' => 'http', 'secret_key' => "I2LJ6dOMmlnXgVAkTPFXd5M3ejkga8Gd2FbBt6iqZdw")); Amon::setup_exception_handler(); + $ENV = "PROD"; } # Convert a stdClass to an Array. http://www.php.net/manual/en/language.types.object.php#102735