add rtk ausbudget import
[disclosr.git] / documents / datagov.py
import sys, os
import time
import scrape
from bs4 import BeautifulSoup
from unidecode import unidecode
   
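# Scrape the data.gov.au catalogue: fetch the dataset list page, then each
# dataset page, and store the extracted metadata via the scrape module's docsdb.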
listurl = "http://data.gov.au/data/"
(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
    listurl, "data", "AGIMO")
soup = BeautifulSoup(datasetlisthtml)
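# every result title on the list page links to an individual dataset page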
for atag in soup.find_all(class_='result-title'):
    if atag.has_attr('href'):
        url = scrape.fullurl(listurl, atag['href'])
        (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
            url, "data", "AGIMO")
        hash = scrape.mkhash(scrape.canonurl(url))
        doc = scrape.docsdb.get(hash)
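        # the "or True" short-circuits the cache check so metadata is
        # re-parsed on every run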
if "metadata" not in doc.keys(): if "metadata" not in doc.keys() or True:
  doc['type'] = "dataset"
doc['metadata'] = {} doc['metadata'] = {}
soup = BeautifulSoup(html) soup = BeautifulSoup(html)
for metatag in soup.find_all('meta'): for metatag in soup.find_all('meta'):
if metatag.has_key('name'): if metatag.has_key('name'):
doc['metadata'][metatag['name']] = metatag['content'] doc['metadata'][metatag['name']] = metatag['content']
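            # dataset fields are marked up as <dl> definition lists:
            # each <dt> names a field, the following <dd> holds its value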
            for dl in soup.find_all('dl'):
                last_title = ""
                for child in dl.children:
                    if str(type(child)) != "<class 'bs4.element.NavigableString'>":
                        if child.name == 'dt' and child.string != None:
                            last_title = child.string.strip()
                        if child.name == 'dd':
                            #print last_title
                            if last_title == "Description":
                                doc['metadata'][last_title] = unidecode(str(child)).encode('ascii', 'ignore')
                            elif last_title == "Download":
                                doc['metadata'][last_title] = []
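                                # each download <li> holds an <a> link plus
                                # dc:format and file-size text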
                                for item in child.find_all("li"):
                                    link = item.find("a")
                                    format = item.find(property="dc:format")
                                    linkobj = {"href": link['href'].replace("/bye?", "").strip(),
                                               "format": format.string.strip(),
                                               "size": format.next_sibling.string.strip()}
                                    if link.string != None:
                                        linkobj["name"] = link.string.strip()
                                    doc['metadata'][last_title].append(linkobj)
   
                            else:
                                atags = child.find_all('a')
                                if len(atags) < 2:
                                    for s in child(class_='viewAll'):
                                        s.extract()
                                    doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
                                else:
                                    doc['metadata'][last_title] = [item.string.replace(",", "").strip() for item in atags]
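            # persist the parsed dataset document back to the docs database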
            print doc['metadata']
            scrape.docsdb.save(doc)
        #time.sleep(2)