# Scrape dataset metadata from the data.gov.au (AGIMO) catalogue.
# For every dataset linked from the listing page, fetch its page, extract
# <meta> tags and <dl> definition-list metadata, and save the result as a
# "dataset" document in the couchdb database exposed by the local scrape
# helper module.
import sys, os
import time

import scrape
from bs4 import BeautifulSoup
from unidecode import unidecode

listurl = "http://data.gov.au/data/"

# Fetch the catalogue listing page (scrape.fetchURL caches via docsdb).
(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
    listurl, "data", "AGIMO")
soup = BeautifulSoup(datasetlisthtml)

# Each search-result title anchor links to one dataset page.
for atag in soup.find_all(class_='result-title'):
    if atag.has_key('href'):
        url = scrape.fullurl(listurl, atag['href'])
        # Final False arg: don't index the dataset page itself as a document.
        (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
            url, "data", "AGIMO", False)
        dochash = scrape.mkhash(scrape.canonurl(url))
        doc = scrape.docsdb.get(dochash)
        # NOTE(review): "or True" forces re-extraction on every run; drop it
        # once metadata extraction is considered stable.
        if "metadata" not in doc.keys() or True:
            doc['type'] = "dataset"
            doc['metadata'] = {}
            soup = BeautifulSoup(html)

            # Flat metadata from <meta name="..." content="..."> tags.
            for metatag in soup.find_all('meta'):
                if metatag.has_key('name'):
                    doc['metadata'][metatag['name']] = metatag['content']

            # Structured metadata from <dl> lists: a <dt> names the field,
            # the following <dd> holds its value.
            for deflist in soup.find_all('dl'):
                last_title = ""
                for child in deflist.children:
                    # Skip bare whitespace/text nodes between the tags.
                    if str(type(child)) == "<class 'bs4.element.NavigableString'>":
                        continue
                    if child.name == 'dt' and child.string is not None:
                        last_title = child.string.strip()
                    if child.name != 'dd':
                        continue
                    if last_title == "Description":
                        # Keep the raw HTML, squashed down to plain ASCII.
                        doc['metadata'][last_title] = unidecode(str(child)).encode('ascii', 'ignore')
                    elif last_title == "Download":
                        # One dict per downloadable file: href/format and,
                        # when present, size and display name.
                        doc['metadata'][last_title] = []
                        for item in child.find_all("li"):
                            link = item.find("a")
                            fmt = item.find(property="dc:format")
                            linkobj = {"href": link['href'].replace("/bye?", "").strip(),
                                       "format": fmt.string.strip()}
                            if fmt.next_sibling.string is not None:
                                linkobj["size"] = fmt.next_sibling.string.strip()
                            if link.string is not None:
                                linkobj["name"] = link.string.strip()
                            doc['metadata'][last_title].append(linkobj)
                    else:
                        atags = child.find_all('a')
                        if len(atags) < 2:
                            # Remove "view all" expander links before
                            # flattening the remaining text.
                            [s.extract() for s in child(class_='viewAll')]
                            doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
                        else:
                            # Multiple links (e.g. tag lists): keep each
                            # link's text, commas stripped.
                            doc['metadata'][last_title] = [item.string.replace(",", "").strip() for item in atags]

            print(doc['metadata'])
            scrape.docsdb.save(doc)
            #time.sleep(2)