From: Alex Sadleir Date: Sat, 26 Jan 2013 15:00:09 +0000 Subject: beginning datagov scraper X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=6875c8d810daec261275743f92ed4c9db658a173 --- beginning datagov scraper Former-commit-id: a8775a64a3cdda480e4433742ed7ea6ca6a437ef --- --- a/documents/404.html +++ b/documents/404.html @@ -1,44 +1,156 @@ - - Page Not Found :( - + + Page Not Found :( + -
Not found :(
Sorry, but the page you were trying to view does not exist.
It looks like this was the result of either:
  • a mistyped address
  • an out-of-date link
--- a/documents/agency.php +++ b/documents/agency.php @@ -12,8 +12,11 @@ include_header_documents((isset($_REQUEST['id']) ? $idtoname[$_REQUEST['id']] : 'Entries by Agency')); $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99'); ?> -
Read all the information released by Australian Federal Government agencies under the FOI Act in one place!
RSS Icon All Agencies RSS Feed
- + - - + + - - - - + + + --- /dev/null +++ b/documents/datagov.py @@ -1,1 +1,48 @@ +import sys, os +import scrape +from bs4 import BeautifulSoup + + +listurl = "http://data.gov.au/data/" +(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb, + listurl, "data", "AGIMO") +soup = BeautifulSoup(datasetlisthtml) +for atag in soup.find_all(class_='result-title'): + if atag.has_key('href'): + url = scrape.fullurl(listurl, atag['href']) + (url, mime_type, html) = scrape.fetchURL(scrape.docsdb, + url, "data", "AGIMO") + hash = scrape.mkhash(scrape.canonurl(url)) + doc = scrape.docsdb.get(hash) + if "metadata" not in doc.keys(): + doc['metadata'] = {} + soup = BeautifulSoup(html) + for metatag in soup.find_all('meta'): + if metatag.has_key('name'): + doc['metadata'][metatag['name']] = metatag['content'] + for list in soup.find_all('dl'): + last_title = "" + for child in list.children: + if str(type(child)) != "": + if child.name == 'dt' and child.string != None: + last_title = child.string.strip() + if child.name == 'dd': + #print last_title + if last_title == "Download": + for item in child.find_all("li"): + link = item.find("a") + format = item.find(property="dc:format") + linkobj = {"href":link['href'].replace("/bye?","").strip(), "name": link.string.strip(), + "format": format.string.strip(), "size": format.next_sibling.string.strip()} + doc['metadata'][last_title] = linkobj + + else: + atags = child.find_all('a') + if len(atags) < 2: + [s.extract() for s in child(class_='viewAll')] + doc['metadata'][last_title] = ''.join(child.stripped_strings).strip() + else: + doc['metadata'][last_title] = [item.string.replace(",","").strip() for item in atags] + print doc['metadata'] + sys.exit("ggg") --- a/documents/date.php +++ b/documents/date.php @@ -5,8 +5,11 @@ include_once('../include/common.inc.php'); $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99'); ?> -
Read all the information released by Australian Federal Government agencies under the FOI Act in one place!
RSS Icon All Agencies RSS Feed
get_db('disclosr-agencies'); --- a/documents/disclogsList.php +++ b/documents/disclogsList.php @@ -34,10 +34,10 @@ if (isset($row->value->FOIDocumentsURL)) { $disclogs++; echo '' - . $row->value->FOIDocumentsURL . ''; + . $row->value->FOIDocumentsURL . ''; if ($ENV == "DEV") echo '
(' - . 'view local copy)'; + . 'view local copy)'; } else { echo ""; } @@ -49,11 +49,11 @@ } else if (file_exists("./scrapers/" . $row->id . '.txt')) { if (trim(file_get_contents("./scrapers/" . $row->id . '.txt')) == "no disclog") { echo ""; - $yellow++; + $yellow++; } else { echo file_get_contents("./scrapers/" . $row->id . '.txt'); - echo ""; - $orange++; + echo ""; + $orange++; } } else { echo ""; @@ -69,7 +69,7 @@ } echo ""; echo $agencies . " agencies, " . round(($disclogs / $agencies) * 100) . "% with disclosure logs; " - . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers "; + . round(($green / $disclogs) * 100) . "% logs with scrapers " . round(($red / $disclogs) * 100) . "% logs without scrapers " . round(($orange / $disclogs) * 100) . "% logs Work-In-Progress scrapers "; include_footer_documents(); ?> --- a/documents/exportAll.csv.php +++ b/documents/exportAll.csv.php @@ -39,7 +39,7 @@ if (is_array($agencyArray[$fieldName])) { $row[] = implode(";", $agencyArray[$fieldName]); } else { - $row[] = str_replace(Array("\n", '"', "\t"),"",$agencyArray[$fieldName]); + $row[] = str_replace(Array("\n", '"', "\t"), "", $agencyArray[$fieldName]); } } else { $row[] = ""; --- /dev/null +++ b/documents/gazette.py --- a/documents/genericScrapers.py +++ b/documents/genericScrapers.py @@ -1,5 +1,6 @@ import sys import os + sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../')) import scrape from bs4 import BeautifulSoup @@ -51,12 +52,12 @@ """ do the scraping """ return + class GenericHTMLDisclogScraper(GenericDisclogScraper): - def doScrape(self): foidocsdb = scrape.couch['disclosr-foidocuments'] (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb, - self.getURL(), "foidocuments", self.getAgencyID()) + self.getURL(), "foidocuments", self.getAgencyID()) content = rcontent dochash = scrape.mkhash(content) doc = foidocsdb.get(dochash) @@ -66,33 +67,32 @@ last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL()) if last_attach != None: html_diff = difflib.HtmlDiff() - description = description + "\nChanges: " - description = description + html_diff.make_table(last_attach.read().split('\n'), - content.split('\n')) + diff = html_diff.make_table(last_attach.read().split('\n'), + content.split('\n')) edate = date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() - , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": description} + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description, "diff": diff} foidocsdb.save(doc) else: print "already saved" + class GenericPDFDisclogScraper(GenericDisclogScraper): - def doScrape(self): foidocsdb = scrape.couch['disclosr-foidocuments'] (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, - self.getURL(), "foidocuments", self.getAgencyID()) + self.getURL(), "foidocuments", self.getAgencyID()) laparams = LAParams() rsrcmgr = PDFResourceManager(caching=True) outfp = StringIO() device = TextConverter(rsrcmgr, outfp, codec='utf-8', - laparams=laparams) + laparams=laparams) fp = StringIO() fp.write(content) process_pdf(rsrcmgr, device, fp, set(), caching=True, - check_extractable=True) + check_extractable=True) description = outfp.getvalue() fp.close() device.close() @@ -103,19 +103,18 @@ print "saving " + dochash edate = 
date.today().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() - , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": description} + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description} foidocsdb.save(doc) else: print "already saved" class GenericDOCXDisclogScraper(GenericDisclogScraper): - def doScrape(self): foidocsdb = scrape.couch['disclosr-foidocuments'] (url, mime_type, content) = scrape.fetchURL(scrape.docsdb - , self.getURL(), "foidocuments", self.getAgencyID()) + , self.getURL(), "foidocuments", self.getAgencyID()) mydoc = zipfile.ZipFile(file) xmlcontent = mydoc.read('word/document.xml') document = etree.fromstring(xmlcontent) @@ -125,7 +124,7 @@ newparatextlist = [] for paratext in paratextlist: newparatextlist.append(paratext.encode("utf-8")) - ## Print our documnts test with two newlines under each paragraph + ## Print our documnts test with two newlines under each paragraph description = '\n\n'.join(newparatextlist).strip(' \t\n\r') dochash = scrape.mkhash(description) doc = foidocsdb.get(dochash) @@ -134,42 +133,42 @@ print "saving " + dochash edate = time().strftime("%Y-%m-%d") doc = {'_id': dochash, 'agencyID': self.getAgencyID() - , 'url': self.getURL(), 'docID': dochash, - "date": edate, "title": "Disclosure Log Updated", "description": description} + , 'url': self.getURL(), 'docID': dochash, + "date": edate, "title": "Disclosure Log Updated", "description": description} foidocsdb.save(doc) else: print "already saved" class GenericRSSDisclogScraper(GenericDisclogScraper): - - def doScrape(self): - foidocsdb = scrape.couch['disclosr-foidocuments'] - (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, - self.getURL(), "foidocuments", self.getAgencyID()) - feed = feedparser.parse(content) - for entry in feed.entries: - #print entry - print entry.id - dochash = scrape.mkhash(entry.id) - doc = foidocsdb.get(dochash) - #print doc - if doc is None: - print "saving " + dochash - edate = datetime.fromtimestamp( - mktime(entry.published_parsed)).strftime("%Y-%m-%d") - doc = {'_id': dochash, 'agencyID': self.getAgencyID(), - 'url': entry.link, 'docID': entry.id, - "date": edate, "title": entry.title} - self.getDescription(entry, entry, doc) - foidocsdb.save(doc) - else: - print "already saved" - - def getDescription(self, content, entry, doc): - """ get description from rss entry""" - doc.update({'description': content.summary}) - return + def doScrape(self): + foidocsdb = scrape.couch['disclosr-foidocuments'] + (url, mime_type, content) = scrape.fetchURL(scrape.docsdb, + self.getURL(), "foidocuments", self.getAgencyID()) + feed = feedparser.parse(content) + for entry in feed.entries: + #print entry + print entry.id + dochash = scrape.mkhash(entry.id) + doc = foidocsdb.get(dochash) + #print doc + if doc is None: + print "saving " + dochash + edate = datetime.fromtimestamp( + mktime(entry.published_parsed)).strftime("%Y-%m-%d") + doc = {'_id': dochash, 'agencyID': self.getAgencyID(), + 'url': entry.link, 'docID': entry.id, + "date": edate, "title": entry.title} + self.getDescription(entry, entry, doc) + foidocsdb.save(doc) + else: + print "already saved" + + def getDescription(self, content, entry, doc): + """ get description from rss entry""" + doc.update({'description': content.summary}) + + return class GenericOAICDisclogScraper(GenericDisclogScraper): @@ -187,7 +186,7 @@ """ get description from rss entry""" descriptiontxt = 
"" for string in content.stripped_strings: - descriptiontxt = descriptiontxt + " \n" + string + descriptiontxt = descriptiontxt + " \n" + string doc.update({'description': descriptiontxt}) def getTitle(self, content, entry, doc): @@ -215,7 +214,7 @@ if atag.has_key('href'): links.append(scrape.fullurl(content, atag['href'])) if links != []: - doc.update({'links': links}) + doc.update({'links': links}) return def doScrape(self): @@ -232,7 +231,7 @@ columns = row.find_all('td') if len(columns) is self.getColumnCount(): (id, date, title, - description, notes) = self.getColumns(columns) + description, notes) = self.getColumns(columns) print self.remove_control_chars( ''.join(id.stripped_strings)) if id.string is None: @@ -248,27 +247,29 @@ if doc is None: print "saving " + dochash doc = {'_id': dochash, - 'agencyID': self.getAgencyID(), - 'url': self.getURL(), - 'docID': (''.join(id.stripped_strings))} + 'agencyID': self.getAgencyID(), + 'url': self.getURL(), + 'docID': (''.join(id.stripped_strings))} self.getLinks(self.getURL(), row, doc) self.getTitle(title, row, doc) self.getDate(date, row, doc) self.getDescription(description, row, doc) if notes is not None: - doc.update({ 'notes': ( + doc.update({'notes': ( ''.join(notes.stripped_strings))}) - badtitles = ['-','Summary of FOI Request' - , 'FOI request(in summary form)' - , 'Summary of FOI request received by the ASC', -'Summary of FOI request received by agency/minister', -'Description of Documents Requested','FOI request', -'Description of FOI Request','Summary of request','Description','Summary', -'Summary of FOIrequest received by agency/minister','Summary of FOI request received','Description of FOI Request',"FOI request",'Results 1 to 67 of 67'] + badtitles = ['-', 'Summary of FOI Request' + , 'FOI request(in summary form)' + , 'Summary of FOI request received by the ASC', + 'Summary of FOI request received by agency/minister', + 'Description of Documents Requested', 'FOI request', + 'Description of FOI Request', 'Summary of request', 'Description', 'Summary', + 'Summary of FOIrequest received by agency/minister', + 'Summary of FOI request received', 'Description of FOI Request', + "FOI request", 'Results 1 to 67 of 67'] if doc['title'] not in badtitles\ and doc['description'] != '': - print "saving" - foidocsdb.save(doc) + print "saving" + foidocsdb.save(doc) else: print "already saved " + dochash --- a/documents/index.php +++ b/documents/index.php @@ -5,8 +5,11 @@ $endkey = (isset($_REQUEST['end_key']) ? $_REQUEST['end_key'] : '9999-99-99'); $enddocid = (isset($_REQUEST['end_docid']) ? $_REQUEST['end_docid'] : null); ?> -
Read all the information released by Australian Federal Government agencies under the FOI Act in one place!
RSS Icon All Agencies RSS Feed
get_db('disclosr-agencies'); @@ -16,7 +19,7 @@ } $foidocsdb = $server->get_db('disclosr-foidocuments'); try { - $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20,null, $enddocid)->rows; + $rows = $foidocsdb->get_view("app", "byDate", Array($endkey, '0000-00-00'), true, 20, null, $enddocid)->rows; if ($rows) { foreach ($rows as $key => $row) { echo displayLogEntry($row, $idtoname); --- a/documents/redirect.php +++ b/documents/redirect.php @@ -1,18 +1,18 @@ --- a/documents/rss.xml.php +++ b/documents/rss.xml.php @@ -23,9 +23,9 @@ $title = 'All Agencies'; } //Use wrapper functions for common channelelements -$TestFeed->setTitle('disclosurelo.gs Newest Entries - '.$title); -$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php'.(isset($_REQUEST['id'])? '?id='.$_REQUEST['id'] : '')); -$TestFeed->setDescription('disclosurelo.gs Newest Entries - '.$title); +$TestFeed->setTitle('disclosurelo.gs Newest Entries - ' . $title); +$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php' . (isset($_REQUEST['id']) ? '?id=' . $_REQUEST['id'] : '')); +$TestFeed->setDescription('disclosurelo.gs Newest Entries - ' . $title); $TestFeed->setChannelElement('language', 'en-us'); $TestFeed->setChannelElement('pubDate', date(DATE_RSS, time())); --- a/documents/scrape.py +++ b/documents/scrape.py @@ -16,6 +16,7 @@ def mkhash(input): return hashlib.md5(input).hexdigest().encode("utf-8") + def canonurl(url): r"""Return the canonical, ASCII-encoded form of a UTF-8 encoded URL, or '' if the URL looks invalid. @@ -67,10 +68,11 @@ url = urlparse.urlunsplit((scheme, netloc, path, query, fragment)) return url[:4096] -def fullurl(url,href): - href = href.replace(" ","%20") - href = re.sub('#.*$','',href) - return urljoin(url,href) + +def fullurl(url, href): + href = href.replace(" ", "%20") + href = re.sub('#.*$', '', href) + return urljoin(url, href) #http://diveintopython.org/http_web_services/etags.html class NotModifiedHandler(urllib2.BaseHandler): @@ -79,37 +81,39 @@ addinfourl.code = code return addinfourl -def getLastAttachment(docsdb,url): + +def getLastAttachment(docsdb, url): hash = mkhash(url) doc = docsdb.get(hash) if doc != None: last_attachment_fname = doc["_attachments"].keys()[-1] - last_attachment = docsdb.get_attachment(doc,last_attachment_fname) + last_attachment = docsdb.get_attachment(doc, last_attachment_fname) return last_attachment else: return None + def fetchURL(docsdb, url, fieldName, agencyID, scrape_again=True): url = canonurl(url) hash = mkhash(url) req = urllib2.Request(url) - print "Fetching %s (%s)" % (url,hash) + print "Fetching %s (%s)" % (url, hash) if url.startswith("mailto") or url.startswith("javascript") or url.startswith("#") or url == None or url == "": print "Not a valid HTTP url" - return (None,None,None) + return (None, None, None) doc = docsdb.get(hash) if doc == None: - doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName':fieldName, 'type': 'website'} + doc = {'_id': hash, 'agencyID': agencyID, 'url': url, 'fieldName': fieldName, 'type': 'website'} else: - if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60*24*14*1000): - print "Uh oh, trying to scrape URL again too soon!"+hash + if (('page_scraped' in doc) and (time.time() - doc['page_scraped']) < 60 * 24 * 14 * 1000): + print "Uh oh, trying to scrape URL again too soon!" 
+ hash last_attachment_fname = doc["_attachments"].keys()[-1] - last_attachment = docsdb.get_attachment(doc,last_attachment_fname) + last_attachment = docsdb.get_attachment(doc, last_attachment_fname) content = last_attachment - return (doc['url'],doc['mime_type'],content.read()) + return (doc['url'], doc['mime_type'], content.read()) if scrape_again == False: print "Not scraping this URL again as requested" - return (doc['url'],doc['mime_type'],content.read()) + return (doc['url'], doc['mime_type'], content.read()) req.add_header("User-Agent", "Mozilla/4.0 (compatible; Prometheus webspider; owner maxious@lambdacomplex.org)") #if there is a previous version stored in couchdb, load caching helper tags @@ -120,9 +124,7 @@ opener = urllib2.build_opener(NotModifiedHandler()) try: - #default_timeout = 12 - #socket.setdefaulttimeout(default_timeout) - url_handle = opener.open(req,None,3) + url_handle = opener.open(req, None, 20) doc['url'] = url_handle.geturl() # may have followed a redirect to a new url headers = url_handle.info() # the addinfourls have the .info() too doc['etag'] = headers.getheader("ETag") @@ -135,75 +137,76 @@ doc['file_size'] = headers.getheader("Content-Length") content_type = headers.getheader("Content-Type") if content_type != None: - doc['mime_type'] = content_type.split(";")[0] + doc['mime_type'] = content_type.split(";")[0] else: - (type,encoding) = mimetypes.guess_type(url) - doc['mime_type'] = type + (type, encoding) = mimetypes.guess_type(url) + doc['mime_type'] = type if hasattr(url_handle, 'code'): if url_handle.code == 304: - print "the web page has not been modified"+hash + print "the web page has not been modified" + hash last_attachment_fname = doc["_attachments"].keys()[-1] - last_attachment = docsdb.get_attachment(doc,last_attachment_fname) + last_attachment = docsdb.get_attachment(doc, last_attachment_fname) content = last_attachment - return (doc['url'],doc['mime_type'],content.read()) + return (doc['url'], doc['mime_type'], content.read()) else: print "new webpage loaded" content = url_handle.read() docsdb.save(doc) doc = docsdb.get(hash) # need to get a _rev - docsdb.put_attachment(doc, content, str(time.time())+"-"+os.path.basename(url), doc['mime_type']) + docsdb.put_attachment(doc, content, str(time.time()) + "-" + os.path.basename(url), doc['mime_type']) return (doc['url'], doc['mime_type'], content) #store as attachment epoch-filename except (urllib2.URLError, socket.timeout) as e: - print "error!" - error = "" - if hasattr(e, 'reason'): - error = "error %s in downloading %s" % (str(e.reason), url) - elif hasattr(e, 'code'): - error = "error %s in downloading %s" % (e.code, url) - print error - doc['error'] = error - docsdb.save(doc) - return (None,None,None) - + print "error!" 
+ error = "" + if hasattr(e, 'reason'): + error = "error %s in downloading %s" % (str(e.reason), url) + elif hasattr(e, 'code'): + error = "error %s in downloading %s" % (e.code, url) + print error + doc['error'] = error + docsdb.save(doc) + return (None, None, None) def scrapeAndStore(docsdb, url, depth, fieldName, agencyID): - (url,mime_type,content) = fetchURL(docsdb, url, fieldName, agencyID) + (url, mime_type, content) = fetchURL(docsdb, url, fieldName, agencyID) badURLs = ["http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report"] if content != None and depth > 0 and url != "http://www.ausport.gov.au/supporting/funding/grants_and_scholarships/grant_funding_report": - if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml": - # http://www.crummy.com/software/BeautifulSoup/documentation.html - soup = BeautifulSoup(content) - navIDs = soup.findAll(id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) - for nav in navIDs: - print "Removing element", nav['id'] + if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml": + # http://www.crummy.com/software/BeautifulSoup/documentation.html + soup = BeautifulSoup(content) + navIDs = soup.findAll( + id=re.compile('nav|Nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')) + for nav in navIDs: + print "Removing element", nav['id'] + nav.extract() + navClasses = soup.findAll( + attrs={'class': re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) + for nav in navClasses: + print "Removing element", nav['class'] nav.extract() - navClasses = soup.findAll(attrs={'class' : re.compile('nav|menu|bar|left|right|sidebar|more-links|breadcrumb|footer|header')}) - for nav in navClasses: - print "Removing element", nav['class'] - nav.extract() - links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) - linkurls = set([]) - for link in links: - if link.has_key("href"): - if link['href'].startswith("http"): - # lets not do external links for now - # linkurls.add(link['href']) - None - if link['href'].startswith("mailto"): - # not http - None - if link['href'].startswith("javascript"): - # not http - None - else: - # remove anchors and spaces in urls - linkurls.add(fullurl(url,link['href'])) - for linkurl in linkurls: - #print linkurl - scrapeAndStore(docsdb, linkurl, depth-1, fieldName, agencyID) + links = soup.findAll('a') # soup.findAll('a', id=re.compile("^p-")) + linkurls = set([]) + for link in links: + if link.has_key("href"): + if link['href'].startswith("http"): + # lets not do external links for now + # linkurls.add(link['href']) + None + if link['href'].startswith("mailto"): + # not http + None + if link['href'].startswith("javascript"): + # not http + None + else: + # remove anchors and spaces in urls + linkurls.add(fullurl(url, link['href'])) + for linkurl in linkurls: + #print linkurl + scrapeAndStore(docsdb, linkurl, depth - 1, fieldName, agencyID) #couch = couchdb.Server('http://192.168.1.148:5984/') couch = couchdb.Server('http://192.168.1.113:5984/') @@ -218,17 +221,17 @@ print agency['name'] for key in agency.keys(): if key == "FOIDocumentsURL" and "status" not in agency.keys() and False: - scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) + scrapeAndStore(docsdb, agency[key], 0, key, agency['_id']) if key == 'website' and True: - scrapeAndStore(docsdb, agency[key],0,key,agency['_id']) - if "metadata" not in agency.keys(): 
- agency['metadata'] = {} + scrapeAndStore(docsdb, agency[key], 0, key, agency['_id']) + if "metadata" not in agency.keys(): + agency['metadata'] = {} agency['metadata']['lastScraped'] = time.time() if key.endswith('URL') and False: print key depth = 1 if 'scrapeDepth' in agency.keys(): depth = agency['scrapeDepth'] - scrapeAndStore(docsdb, agency[key],depth,key,agency['_id']) + scrapeAndStore(docsdb, agency[key], depth, key, agency['_id']) agencydb.save(agency) --- a/documents/search.php +++ b/documents/search.php @@ -2,23 +2,23 @@ include_once('include/common.inc.php'); include_header('Search'); ?> -
Search
body); $db = $server->get_db('disclosr-documents'); foreach ($results->rows as $result) { //print_r($result); - //$row = $db->get($result->id); - echo $result->doc->_id." ".$result->doc->url."
".PHP_EOL; + //$row = $db->get($result->id); + echo $result->doc->_id . " " . $result->doc->url . "
" . PHP_EOL; } } include_footer(); --- a/documents/template.inc.php +++ b/documents/template.inc.php @@ -1,101 +1,109 @@ - - - + + + - - + + - Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?> - + Australian Disclosure Logs<?php if ($title != "") echo " - $title"; ?> + - - - - - + + + + + - - - - - - - - - - - - - - - - -
Not affiliated with or endorsed by any government agency.
+ var _gaq = _gaq || []; _gaq.push(['_setAccount', 'UA-12341040-4']); @@ -113,32 +121,33 @@ })(); "; - } - ?> - - - + } + ?> + + + - + - $length) { //limit hit! @@ -154,14 +163,15 @@ return $string; } -function displayLogEntry($row, $idtoname) { +function displayLogEntry($row, $idtoname) +{ $result = ""; $result .= '
'; - $result .= '

: " . truncate($row->value->title, 120) . ""; + $result .= '

: " . truncate($row->value->title, 120) . ""; $result .= ' (' . $idtoname[$row->value->agencyID] . ')

'; $result .= "

Title: " . $row->value->title . "
"; if (isset($row->value->description)) { - $result .= str_replace("\n", "
", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "",trim($row->value->description))); + $result .= str_replace("\n", "
", preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "", trim($row->value->description))); } if (isset($row->value->notes)) { $result .= "
Note: " . $row->value->notes; @@ -171,7 +181,7 @@ if (isset($row->value->links)) { $result .= '

Links/Documents

"; --- a/documents/view.php +++ b/documents/view.php @@ -14,11 +14,11 @@ } $foidocsdb = $server->get_db('disclosr-foidocuments'); try { - $obj = new stdClass(); + $obj = new stdClass(); $obj->value = $foidocsdb->get($_REQUEST['id']); include_header_documents($obj->value->title); -echo displayLogEntry($obj,$idtoname); + echo displayLogEntry($obj, $idtoname); } catch (SetteeRestClientException $e) { setteErrorHandler($e); --- a/documents/viewDocument.php +++ b/documents/viewDocument.php @@ -4,7 +4,7 @@ $hash = $_REQUEST['hash']; $docsdb = $server->get_db('disclosr-documents'); try { -$doc = object_to_array($docsdb->get($hash)); + $doc = object_to_array($docsdb->get($hash)); } catch (SetteeRestClientException $e) { setteErrorHandler($e); @@ -15,7 +15,7 @@ $attachments = $doc['_attachments']; $attachment_filenames = array_keys($attachments); //print_r($attachments); -$url = $serverAddr.'disclosr-documents/'.$hash.'/'.urlencode($attachment_filenames[0]); +$url = $serverAddr . 'disclosr-documents/' . $hash . '/' . urlencode($attachment_filenames[0]); //echo $url; $request = Requests::get($url); echo ($request->body);