"""Collect metadata for the datasets listed on data.gov.au.

The dataset listing page is fetched through the project's ``scrape`` helpers,
every ``result-title`` link on it is followed, and for each dataset document
that has no ``metadata`` entry yet a metadata dict is built from the dataset
page's ``<meta>`` tags and ``<dl>`` definition lists.
"""
import os
import sys

import scrape

from bs4 import BeautifulSoup


def _download_links(dd):
    """Return one dict per ``<li>`` resource found under a "Download" ``<dd>``.

    Each dict always has ``href`` (with the "/bye?" redirect prefix removed)
    and carries ``name``, ``format`` and ``size`` only when the page provides
    them. Items without a link or without a ``dc:format`` element are skipped
    instead of raising.
    """
    links = []
    for item in dd.find_all("li"):
        link = item.find("a")
        fmt = item.find(property="dc:format")
        if link is None or fmt is None or not link.has_attr('href'):
            continue
        entry = {"href": link['href'].replace("/bye?", "").strip()}
        if link.string is not None:
            entry["name"] = link.string.strip()
        if fmt.string is not None:
            entry["format"] = fmt.string.strip()
        # The size is read from whatever node directly follows the format
        # element.
        size = fmt.next_sibling
        if size is not None and size.string is not None:
            entry["size"] = size.string.strip()
        links.append(entry)
    return links


def _definition_value(dd):
    """Return the value held by a non-download ``<dd>`` element.

    With fewer than two links the value is the element's text with any
    ``viewAll`` elements removed first; otherwise it is a list holding the
    text of every link with commas stripped.
    """
    anchors = dd.find_all('a')
    if len(anchors) < 2:
        for extra in dd(class_='viewAll'):
            extra.extract()
        return ''.join(dd.stripped_strings).strip()
    # get_text() also copes with links that wrap more than one child node,
    # where .string would be None.
    return [anchor.get_text().replace(",", "").strip() for anchor in anchors]


def _page_metadata(page):
    """Build the metadata dict for one parsed dataset page.

    ``<meta name=... content=...>`` tags are copied verbatim. Every ``<dl>``
    is read as title/value pairs: a ``<dt>`` sets the current title and each
    following ``<dd>`` is stored under it. All resources listed under a
    "Download" title are gathered into a single list of link dicts.
    """
    metadata = {}
    for metatag in page.find_all('meta'):
        if metatag.has_attr('name') and metatag.has_attr('content'):
            metadata[metatag['name']] = metatag['content']

    downloads = []
    for definition_list in page.find_all('dl'):
        last_title = ""
        # Only direct <dt>/<dd> children matter; this also skips text and
        # comment nodes between them.
        for child in definition_list.find_all(['dt', 'dd'], recursive=False):
            if child.name == 'dt':
                if child.string is not None:
                    last_title = child.string.strip()
            elif last_title == "Download":
                downloads.extend(_download_links(child))
            else:
                metadata[last_title] = _definition_value(child)
    if downloads:
        # Keep every resource rather than only the last one encountered.
        metadata["Download"] = downloads
    return metadata


listurl = "http://data.gov.au/data/"
(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
                                                    listurl, "data", "AGIMO")
soup = BeautifulSoup(datasetlisthtml)
for atag in soup.find_all(class_='result-title'):
    if not atag.has_attr('href'):
        continue
    url = scrape.fullurl(listurl, atag['href'])
    (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
                                             url, "data", "AGIMO")
    doc_id = scrape.mkhash(scrape.canonurl(url))
    doc = scrape.docsdb.get(doc_id)
    if "metadata" in doc:
        # This dataset already has metadata; leave it alone.
        continue
    doc['metadata'] = _page_metadata(BeautifulSoup(html))
    # NOTE(review): the original indentation was lost, so the placement of
    # this print/exit pair is an assumption: the script stops right after the
    # first dataset it scrapes. It looks like a debugging stop, and the
    # metadata is not written back to scrape.docsdb in the code visible
    # here -- confirm the intended behaviour.
    print(doc['metadata'])
    sys.exit("ggg")