From: maxious
Date: Sun, 11 Mar 2012 10:39:08 +0000
Subject: Testing facebook imports
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=758723a1e47a93641ef639806f8d72c2dfeeb1f7

---

Testing facebook imports

Former-commit-id: 7c36c79b53a3bdd3ff5acaec5e5ceadb4b73aff1
---

--- a/admin/importGov2RegisterRSSFacebookTwitter.php
+++ b/admin/importGov2RegisterRSSFacebookTwitter.php
@@ -1,17 +1,51 @@
 create_db('disclosr-agencies');
-} catch (SetteeRestClientException $e) {
-    setteErrorHandler($e);
+
+$db = $server->get_db('disclosr-agencies');
+$rows = $db->get_view("app", "byName")->rows;
+$nametoid = Array();
+$accounts = Array();
+foreach ($rows as $row) {
+    $nametoid[trim($row->key)] = $row->value;
 }
-$db = $server->get_db('disclosr-agencies');
-createAgencyDesignDoc();
-// twitter https://docs.google.com/spreadsheet/fm?id=tsJVd9EYoAjbl014y3qMgWg.03918275400592898296.8568379511161083736&hl=en&fmcmd=5&gid=0
-// RSS https://docs.google.com/spreadsheet/fm?id=tbqjwIDHKHiVYF_glJ93GgA.03918275400592898296.8789688748524615194&authkey=CJDP-uQG&hl=en_GB&fmcmd=5&gid=0
-// facebook https://docs.google.com/spreadsheet/fm?id=tkcqoo9wrgzNWmoANuVhsBw.03918275400592898296.3040387705062056060&authkey=CKzl7r0I&hl=en_GB&fmcmd=5&gid=0
+function extractCSVAccounts($url, $nameField, $accountField, $filter) {
+    global $accounts, $nametoid;
+    $request = Requests::get($url);
+    $Data = str_getcsv($request->body, "\n"); //parse the rows
+    $headers = Array();
+    foreach ($Data as $num => $line) {
+        $Row = str_getcsv($line, ",");
+        if ($num == 0) {
+
+        } else if ($num == 1) {
+            $headers = $Row;
+            //print_r($headers);
+        } else {
+            if (isset($Row[array_search($nameField, $headers)])) {
+                $agencyName = $Row[array_search($nameField, $headers)];
+                if (!$filter || $Row[array_search("State", $headers)] == "NAT") {
+                    if (!in_array(trim($agencyName), array_keys($nametoid))) {
+                        echo "$agencyName missing" . PHP_EOL;
+                    } else {
+                        // echo $Row[array_search($nameField, $headers)] . PHP_EOL;
+                    }
+                }
+            } else {
+                //echo "error finding agency" . $line . PHP_EOL;
+            }
+        }
+    }
+}
+
+// http://agimo.govspace.gov.au/page/gov2register/
+// twitter
+//extractCSVAccounts("https://docs.google.com/spreadsheet/pub?key=0Ap1exl80wB8OdHNKVmQ5RVlvQWpibDAxNHkzcU1nV2c&single=true&gid=0&output=csv", "Agency/Body/Event", "", true);
+// RSS
+// https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGJxandJREhLSGlWWUZfZ2xKOTNHZ0E&output=csv
+// facebook
+extractCSVAccounts("https://docs.google.com/spreadsheet/pub?hl=en_GB&hl=en_GB&key=0Ah41IAK0HzSTdGtjcW9vOXdyZ3pOV21vQU51VmhzQnc&single=true&gid=0&output=csv", "Agency", "Name");
 /*
  * http://australia.gov.au/news-and-media/media-release-rss-feeds
--- a/couchdb/settee
+++ b/couchdb/settee
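The new extractCSVAccounts() helper downloads a published Google Spreadsheet as CSV and reports each agency name that has no matching record in the CouchDB byName view; when $filter is set it only considers rows whose State column is "NAT". A minimal standalone sketch of that matching step, assuming a plain file_get_contents() fetch rather than the Requests library (the URL, the "Agency" column and the $nametoid contents below are placeholders, not part of the commit):

<?php
// Sketch only, not the committed importer: flag CSV rows whose agency name
// has no CouchDB record. $url and $nametoid are illustrative; the real script
// builds $nametoid from the "byName" view and fetches the sheet via Requests.
$url = "https://example.org/gov2-register.csv";
$nametoid = Array("Department of Example" => "exampleAgencyID");

$lines = str_getcsv(file_get_contents($url), "\n"); // split the body into rows
$headers = str_getcsv($lines[1], ",");              // row 1 carries the column names
foreach (array_slice($lines, 2) as $line) {         // data rows start at row 2
    $row = str_getcsv($line, ",");
    $name = trim($row[array_search("Agency", $headers)]);
    if (!isset($nametoid[$name])) {
        echo "$name missing" . PHP_EOL;              // no matching agency record
    }
}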
    "; foreach ($value as $subkey => $subvalue) { if (isset($schemas['agency']["properties"][$key]['x-property'])) { - echo '
  1. '; - } else { - echo "
  2. "; - } + echo '
  3. '; + } else { + echo "
  4. "; + } echo "$subvalue
  5. "; } echo "
"; @@ -27,11 +27,11 @@ } else { echo ""; } - if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { - echo "view"; - } else { - echo "$value"; - } + if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { + echo "view"; + } else { + echo "$value"; + } } echo ""; } @@ -53,12 +53,12 @@ } else if ($key == "parentOrg") { echo ""; - } else { + } else { echo ""; if ((strpos($key, "URL") > 0 || $key == 'website') && $value != "") { echo "view"; @@ -69,7 +69,7 @@ } } } - // +// } function addDefaultFields($row) { @@ -78,13 +78,17 @@ foreach ($defaultFields as $defaultField) { if (!isset($row[$defaultField])) { if ($schemas['agency']['properties'][$defaultField]['type'] == "string") { - - $row[$defaultField] = ""; - + $row[$defaultField] = ""; } if ($schemas['agency']['properties'][$defaultField]['type'] == "array") { - $row[$defaultField] = Array(""); + } + } else if ($schemas['agency']['properties'][$defaultField]['type'] == "array") { + if (is_array($row[$defaultField])) { + $row[$defaultField][] = ""; + } else { + $value = $row[$defaultField]; + $row[$defaultField] = Array($value); } } } @@ -94,20 +98,28 @@ $db = $server->get_db('disclosr-agencies'); if (isset($_REQUEST['id'])) { - //get an agency record as json/html, search by name/abn/id +//get an agency record as json/html, search by name/abn/id // by name = startkey="Ham"&endkey="Ham\ufff0" // edit? $row = $db->get($_REQUEST['id']); - //print_r($row); +//print_r($row); if (sizeof($_POST) > 0) { - //print_r($_POST); +//print_r($_POST); foreach ($_POST as $postkey => $postvalue) { if ($postvalue == "") { unset($_POST[$postkey]); } - if (is_array($postvalue) && count($postvalue) == 1 && $postvalue[0] == "") { - unset($_POST[$postkey]); + if (is_array($postvalue)) { + if (count($postvalue) == 1 && $postvalue[0] == "") { + unset($_POST[$postkey]); + } else { + foreach ($_POST[$postkey] as $key => &$value) { + if ($value == "") { + unset($_POST[$postkey][$key]); + } + } + } } } if (isset($_POST['_id']) && $db->get_rev($_POST['_id']) == $_POST['_rev']) { @@ -124,9 +136,9 @@ if ($mode == "edit") { $row = addDefaultFields(object_to_array($row)); } else { - $row = object_to_array($row); - } - + $row = object_to_array($row); + } + if ($mode == "view") { echo '
'; echo '"; @@ -153,44 +165,44 @@ }; - $value) { - echo displayValue($key, $value, $mode); - } - if ($mode == "view") { - echo "

' . $row['name'] . "

"; - } - if ($mode == "edit") { - echo ''; - } -} else { - - try { - /* $rows = $db->get_view("app", "showNamesABNs")->rows; - //print_r($rows); - foreach ($rows as $row) { - // print_r($row); - echo '
  • ' . - (isset($row->value->name) && $row->value->name != "" ? $row->value->name : "NO NAME " . $row->value->abn) - . '
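The reworked addDefaultFields() above now distinguishes a field that is missing from one that already holds data: an existing array gets an extra empty slot for the edit form, while a bare scalar is wrapped in an array. A minimal sketch of that branch under the same data shapes (normaliseArrayField and the example field are illustrative only, not part of the commit):

<?php
// Sketch only: how the new branch treats an existing value for an
// "array"-typed schema field before the edit form is rendered.
function normaliseArrayField(array $row, $field) {
    if (isset($row[$field])) {
        if (is_array($row[$field])) {
            $row[$field][] = "";                 // already a list: append a blank entry
        } else {
            $row[$field] = Array($row[$field]);  // single value: wrap it in a list
        }
    }
    return $row;
}

print_r(normaliseArrayField(Array("hasTwitter" => "@example"), "hasTwitter"));
// Array ( [hasTwitter] => Array ( [0] => @example ) )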
--- a/schemas/agency.json.php
+++ b/schemas/agency.json.php
@@ -33,6 +33,8 @@
     "appointmentsURL" => Array("type" => "string", "required" => true, "x-title" => "Agency Appointments/Boards", "description" => "Departmental and agency appointments and vacancies, mandated by the Senate"),
     "advertisingURL" => Array("type" => "string", "required" => true, "x-title" => "Approved Advertising Campaigns", "description" => "Agency advertising and public information projects, mandated by the Senate"),
     "hasRSS" => Array("type" => "array", "required" => true, "x-title" => "Has RSS", "description" => ""),
+    "hasBlog" => Array("type" => "array", "required" => true, "x-title" => "Has Blog", "description" => ""),
+    "hasMobileApp" => Array("type" => "array", "required" => true, "x-title" => "Has Mobile App", "description" => ""),
     "hasMailingList" => Array("type" => "array", "required" => true, "x-title" => "Has Mailing List", "description" => "", "items" => Array("type" => "string")),
     "hasTwitter" => Array("type" => "array", "required" => true, "x-title" => "Has Twitter", "description" => "",
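hasBlog and hasMobileApp follow the same pattern as the existing hasRSS and hasTwitter entries: array-typed, required, with an x-title used as the form label. An agency document exercising the new fields might look like the fragment below (the values are made up for illustration):

<?php
// Illustrative only: an agency record fragment using the two new schema fields.
$agency = Array(
    "name"         => "Example Agency",
    "hasBlog"      => Array("http://blog.example.gov.au/"),
    "hasMobileApp" => Array("http://example.gov.au/app"),
);
echo json_encode($agency) . PHP_EOL;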
--- a/scrape.py
+++ b/scrape.py
@@ -77,7 +77,7 @@
     print "Fetching %s" % url
     if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "":
         print "Not a valid HTTP url"
-        return (None,None)
+        return (None,None,None)
     doc = docsdb.get(hash)
     if doc == None:
         doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName}
@@ -86,13 +86,14 @@
             print "Uh oh, trying to scrape URL again too soon!"
             last_attachment_fname = doc["_attachments"].keys()[-1]
             last_attachment = docsdb.get_attachment(doc,last_attachment_fname)
-            return (doc['mime_type'],last_attachment)
+            return (doc['url'],doc['mime_type'],last_attachment)
         if scrape_again == False:
             print "Not scraping this URL again as requested"
-            return (None,None)
+            return (None,None,None)

     time.sleep(3) # wait 3 seconds to give webserver time to recover

+    req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)")
     #if there is a previous version stored in couchdb, load caching helper tags
     if doc.has_key('etag'):
         req.add_header("If-None-Match", doc['etag'])
@@ -102,12 +103,14 @@
     opener = urllib2.build_opener(NotModifiedHandler())
     try:
         url_handle = opener.open(req)
+        doc['url'] = url_handle.geturl() # may have followed a redirect to a new url
         headers = url_handle.info() # the addinfourls have the .info() too
         doc['etag'] = headers.getheader("ETag")
         doc['last_modified'] = headers.getheader("Last-Modified")
         doc['date'] = headers.getheader("Date")
         doc['page_scraped'] = time.time()
         doc['web_server'] = headers.getheader("Server")
+        doc['via'] = headers.getheader("Via")
         doc['powered_by'] = headers.getheader("X-Powered-By")
         doc['file_size'] = headers.getheader("Content-Length")
         content_type = headers.getheader("Content-Type")
@@ -119,13 +122,13 @@
         if hasattr(url_handle, 'code'):
             if url_handle.code == 304:
                 print "the web page has not been modified"
-                return (None,None)
+                return (None,None,None)
             else:
                 content = url_handle.read()
                 docsdb.save(doc)
                 doc = docsdb.get(hash) # need to get a _rev
                 docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type'])
-                return (doc['mime_type'], content)
+                return (doc['url'], doc['mime_type'], content)
                 #store as attachment epoch-filename
     except urllib2.URLError as e:
         error = ""
@@ -136,21 +139,22 @@
         print error
         doc['error'] = error
         docsdb.save(doc)
-        return (None,None)
+        return (None,None,None)

 def scrapeAndStore(docsdb, url, depth, fieldName, agencyID):
-    (mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
-    if content != None and depth > 0:
+    (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID)
+    badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"]
+    if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report":
         if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
             # http://www.crummy.com/software/BeautifulSoup/documentation.html
             soup = BeautifulSoup(content)
-            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar'))
+            navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header'))
             for nav in navIDs:
                 print "Removing element", nav['id']
                 nav.extract()
-            navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar')})
+            navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')})
             for nav in navClasses:
                 print "Removing element", nav['class']
                 nav.extract()
@@ -169,7 +173,10 @@
                     # not http
                     None
                 else:
-                    linkurls.add(urljoin(url,link['href'].replace(" ","%20")))
+                    # remove anchors and spaces in urls
+                    link['href'] = link['href'].replace(" ","%20")
+                    link['href'] = re.sub('#.*$','',link['href'])
+                    linkurls.add(urljoin(url,link['href']))
             for linkurl in linkurls:
                 #print linkurl
                 scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID)
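In scrape.py, fetchURL() now returns the final URL alongside the MIME type and content so that redirects are recorded, and scrapeAndStore() encodes spaces and strips fragment anchors before resolving each link against the page URL. A small standalone sketch of that link normalisation, in Python 2 to match the scraper, using made-up example URLs:

# Sketch only, not part of scrape.py: the href clean-up applied before urljoin.
import re
from urlparse import urljoin  # urllib.parse.urljoin on Python 3

def normalise_link(page_url, href):
    href = href.replace(" ", "%20")   # encode spaces so the path survives urljoin
    href = re.sub('#.*$', '', href)   # drop the anchor so fragments don't create duplicate URLs
    return urljoin(page_url, href)

print normalise_link("http://www.example.gov.au/grants/", "report 2011.html#summary")
# http://www.example.gov.au/grants/report%202011.html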