# gitphp 0.2.9.1 :: disclosr.git/blobdiff

import sys, os

import scrape

from bs4 import BeautifulSoup

# Scrape dataset metadata from the data.gov.au catalogue listing.
#
# Fetches the listing page, follows each 'result-title' link that carries an
# href, and collects the linked page's <meta> tags and <dl> entries into
# doc['metadata'].
#
# NOTE(review): indentation was lost in the copy this script was restored
# from; the nesting below is reconstructed from statement order -- confirm it
# against the repository version.

listurl = "http://data.gov.au/data/"


def _stripped(node):
    """Return ``node.string`` with surrounding whitespace removed.

    Returns '' when *node* is None or has no single string, so a missing
    element yields an empty field instead of an AttributeError.
    """
    text = getattr(node, 'string', None)
    return text.strip() if text is not None else ""


def _download_link(item):
    """Build the link record for one <li> under a "Download" <dd>.

    Returns a dict with "href", "name", "format" and "size" keys, or None
    when the <li> lacks its anchor or its dc:format element.
    """
    link = item.find("a")
    fmt = item.find(property="dc:format")
    if link is None or fmt is None:
        return None
    return {
        # "/bye?" is removed from the href -- presumably an exit-redirect
        # prefix wrapped around the real file URL; TODO confirm.
        "href": link.get('href', '').replace("/bye?", "").strip(),
        "name": _stripped(link),
        "format": _stripped(fmt),
        # The size is read from whatever node directly follows the format
        # element.
        "size": _stripped(fmt.next_sibling),
    }


def _dd_value(dd):
    """Return the metadata value held by a non-"Download" <dd> element.

    With fewer than two links the value is the element's joined text (after
    dropping any 'viewAll' children); otherwise it is the list of link texts
    with commas removed.
    """
    anchors = dd.find_all('a')
    if len(anchors) < 2:
        # extract() detaches the node from the tree, so the 'viewAll' text
        # does not end up in the joined value.
        for extra in dd(class_='viewAll'):
            extra.extract()
        return ''.join(dd.stripped_strings).strip()
    return [(a.string or "").replace(",", "").strip() for a in anchors]


(url, mime_type, datasetlisthtml) = scrape.fetchURL(
    scrape.docsdb, listurl, "data", "AGIMO")
soup = BeautifulSoup(datasetlisthtml)

for atag in soup.find_all(class_='result-title'):
    if not atag.has_attr('href'):
        continue

    url = scrape.fullurl(listurl, atag['href'])
    (url, mime_type, html) = scrape.fetchURL(
        scrape.docsdb, url, "data", "AGIMO")
    doc_id = scrape.mkhash(scrape.canonurl(url))
    # NOTE(review): assumes docsdb.get() returns a mapping for this id (the
    # fetch above presumably stored it) -- confirm it cannot return None.
    doc = scrape.docsdb.get(doc_id)
    if "metadata" not in doc:
        doc['metadata'] = {}

    page_soup = BeautifulSoup(html)

    # Named <meta> tags map directly to metadata entries; tags without a
    # content attribute are skipped rather than raising KeyError.
    for metatag in page_soup.find_all('meta'):
        if metatag.has_attr('name') and metatag.has_attr('content'):
            doc['metadata'][metatag['name']] = metatag['content']

    # Each <dl> is read as title/value pairs: a <dt> sets the current title
    # and every following <dd> is stored under it.
    for dl in page_soup.find_all('dl'):
        last_title = ""
        # recursive=False restricts the walk to direct <dt>/<dd> children,
        # which also skips text nodes and comments between them.
        for child in dl.find_all(['dt', 'dd'], recursive=False):
            if child.name == 'dt':
                if child.string is not None:
                    last_title = child.string.strip()
                continue

            if last_title == "Download":
                # NOTE(review): every <li> overwrites the previous value, so
                # only the last download link is kept -- confirm whether a
                # list of links was intended.
                for item in child.find_all("li"):
                    linkobj = _download_link(item)
                    if linkobj is not None:
                        doc['metadata'][last_title] = linkobj
            else:
                doc['metadata'][last_title] = _dd_value(child)

    # print() with a single argument behaves the same on Python 2 and 3.
    print(doc['metadata'])
    # Stops after the first dataset page: sys.exit() with a string writes it
    # to stderr and exits with status 1. Presumably a debugging stop.
    sys.exit("ggg")