gitphp 0.2.9.1 :: disclosr.git/blob

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48	import sys, os import scrape from bs4 import BeautifulSoup listurl = "http://data.gov.au/data/" (url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb, listurl, "data", "AGIMO") soup = BeautifulSoup(datasetlisthtml) for atag in soup.find_all(class_='result-title'): if atag.has_key('href'): url = scrape.fullurl(listurl, atag['href']) (url, mime_type, html) = scrape.fetchURL(scrape.docsdb, url, "data", "AGIMO") hash = scrape.mkhash(scrape.canonurl(url)) doc = scrape.docsdb.get(hash) if "metadata" not in doc.keys(): doc['metadata'] = {} soup = BeautifulSoup(html) for metatag in soup.find_all('meta'): if metatag.has_key('name'): doc['metadata'][metatag['name']] = metatag['content'] for list in soup.find_all('dl'): last_title = "" for child in list.children: if str(type(child)) != "<class 'bs4.element.NavigableString'>": if child.name == 'dt' and child.string != None: last_title = child.string.strip() if child.name == 'dd': #print last_title if last_title == "Download": for item in child.find_all("li"): link = item.find("a") format = item.find(property="dc:format") linkobj = {"href":link['href'].replace("/bye?","").strip(), "name": link.string.strip(), "format": format.string.strip(), "size": format.next_sibling.string.strip()} doc['metadata'][last_title] = linkobj else: atags = child.find_all('a') if len(atags) < 2: [s.extract() for s in child(class_='viewAll')] doc['metadata'][last_title] = ''.join(child.stripped_strings).strip() else: doc['metadata'][last_title] = [item.string.replace(",","").strip() for item in atags] print doc['metadata'] sys.exit("ggg")

import sys, os
 
import scrape
from bs4 import BeautifulSoup
 
 
# Scrape the data.gov.au dataset listing, follow the first result link and
# collect that dataset page's <meta> tags and <dl> definition lists into the
# 'metadata' entry of the document looked up in scrape.docsdb.


def _node_text(node):
    """Return the stripped ``.string`` of *node*, or None if it has none.

    Accepts None (e.g. a failed ``find`` or a missing sibling) so callers do
    not have to guard every lookup. ``.string`` is also None for a tag that
    does not wrap a single string.
    """
    text = getattr(node, 'string', None)
    if text is None:
        return None
    return text.strip()


def _meta_tags(page):
    """Return ``{name: content}`` for every <meta> carrying both attributes."""
    found = {}
    for meta in page.find_all('meta'):
        name = meta.get('name')
        content = meta.get('content')
        # A <meta name=...> without content used to raise KeyError; skip it.
        if name is not None and content is not None:
            found[name] = content
    return found


def _download_links(dd):
    """Return one record per <li> link found inside a "Download" <dd>.

    Each record has the keys "href", "name", "format" and "size"; the last
    three are None when the corresponding markup is missing.
    """
    links = []
    for item in dd.find_all("li"):
        link = item.find("a")
        if link is None or link.get('href') is None:
            # Entry without a usable anchor: nothing to record.
            continue
        fmt = item.find(property="dc:format")
        if fmt is None:
            size = None
        else:
            # The size is read from the node right after the format element.
            size = _node_text(fmt.next_sibling)
        links.append({
            "href": link['href'].replace("/bye?", "").strip(),
            "name": _node_text(link),
            "format": _node_text(fmt),
            "size": size,
        })
    return links


def _plain_value(dd):
    """Return the value of a non-download <dd>.

    With fewer than two anchors the result is the <dd>'s text with any
    'viewAll' elements removed; otherwise it is the list of anchor texts
    with commas stripped.
    """
    anchors = dd.find_all('a')
    if len(anchors) < 2:
        for extra in dd(class_='viewAll'):
            extra.extract()
        return ''.join(dd.stripped_strings)
    # get_text() also copes with anchors that wrap nested markup, where
    # .string would be None.
    return [a.get_text().replace(",", "").strip() for a in anchors]


def _definition_lists(page):
    """Return ``{title: value}`` parsed from every <dl> on *page*.

    Each <dd> is stored under the text of the most recent <dt>. Links under
    a "Download" title are accumulated into a list so none is overwritten.
    """
    found = {}
    for dl in page.find_all('dl'):
        last_title = ""
        # Only direct <dt>/<dd> children matter; this skips whitespace
        # strings, comments and any other node type.
        for child in dl.find_all(['dt', 'dd'], recursive=False):
            if child.name == 'dt':
                # NOTE(review): a <dt> whose .string is None keeps the
                # previous title, so the following <dd> lands under that
                # older key -- confirm this is intended.
                if child.string is not None:
                    last_title = child.string.strip()
                continue
            if last_title == "Download":
                links = _download_links(child)
                if links:
                    found.setdefault(last_title, []).extend(links)
            else:
                found[last_title] = _plain_value(child)
    return found


listurl = "http://data.gov.au/data/"
(url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
    listurl, "data", "AGIMO")
listing = BeautifulSoup(datasetlisthtml)
for atag in listing.find_all(class_='result-title'):
    href = atag.get('href')
    if href is None:
        continue
    url = scrape.fullurl(listurl, href)
    (url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
        url, "data", "AGIMO")
    doc_id = scrape.mkhash(scrape.canonurl(url))
    # NOTE(review): presumably fetchURL has already stored a document under
    # this id; if get() can return None here the next line fails -- verify.
    doc = scrape.docsdb.get(doc_id)
    if "metadata" not in doc:
        doc['metadata'] = {}
    page = BeautifulSoup(html)
    # <meta> values first, so <dl> entries win on a key clash as before.
    doc['metadata'].update(_meta_tags(page))
    doc['metadata'].update(_definition_lists(page))
    print(doc['metadata'])
    # NOTE(review): exits (status 1, "ggg" on stderr) after the first dataset
    # page, and the metadata is only printed, never written back here. Looks
    # like a debugging leftover -- confirm before removing.
    sys.exit("ggg")