beginning datagov scraper
[disclosr.git] / documents /
blob:a/documents/ -> blob:b/documents/
--- a/documents/
+++ b/documents/
@@ -1,8 +1,9 @@
 import sys, os
+import time
 import scrape
 from bs4 import BeautifulSoup
+from unidecode import unidecode
 listurl = ""
 (url, mime_type, datasetlisthtml) = scrape.fetchURL(scrape.docsdb,
@@ -15,34 +16,42 @@
             url, "data", "AGIMO")
         hash = scrape.mkhash(scrape.canonurl(url))
         doc = scrape.docsdb.get(hash)
-        if "metadata" not in doc.keys():
+        if "metadata" not in doc.keys() or True:
+            doc['type'] = "dataset"
             doc['metadata'] = {}
-        soup = BeautifulSoup(html)
-        for metatag in soup.find_all('meta'):
-            if metatag.has_key('name'):
-                doc['metadata'][metatag['name']] = metatag['content']
-        for list in soup.find_all('dl'):
-            last_title = ""
-            for child in list.children:
-                if str(type(child)) != "<class 'bs4.element.NavigableString'>":
-                    if child.name == 'dt' and child.string != None:
-                        last_title = child.string.strip()
-                    if child.name == 'dd':
-                        #print last_title
-                        if last_title == "Download":
-                            for item in child.find_all("li"):
-                                link = item.find("a")
-                                format = item.find(property="dc:format")
-                                linkobj = {"href":link['href'].replace("/bye?","").strip(), "name": link.string.strip(),
-                                    "format": format.string.strip(), "size": format.next_sibling.string.strip()}
-                                doc['metadata'][last_title] = linkobj
+            soup = BeautifulSoup(html)
+            for metatag in soup.find_all('meta'):
+                if metatag.has_key('name'):
+                    doc['metadata'][metatag['name']] = metatag['content']
+            for list in soup.find_all('dl'):
+                last_title = ""
+                for child in list.children:
+                    if str(type(child)) != "<class 'bs4.element.NavigableString'>":
+                        if child.name == 'dt' and child.string != None:
+                            last_title = child.string.strip()
+                        if child.name == 'dd':
+                            #print last_title
+                            if last_title == "Description":
+                                doc['metadata'][last_title] = unidecode(str(child)).encode('ascii', 'ignore')
+                            elif last_title == "Download":
+                                doc['metadata'][last_title] = {}
+                                for item in child.find_all("li"):
+                                    link = item.find("a")
+                                    format = item.find(property="dc:format")
+                                    linkobj = {"href":link['href'].replace("/bye?","").strip(),
+                                            "format": format.string.strip(), "size": format.next_sibling.string.strip()}
+                                    if link.string != None:
+                                        linkobj["name"] = link.string.strip()
+                                    doc['metadata'][last_title][] = linkobj
-                        else:
-                            atags = child.find_all('a')
-                            if len(atags) < 2:
-                                [s.extract() for s in child(class_='viewAll')]
-                                doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
-                                doc['metadata'][last_title] = [item.string.replace(",","").strip() for item in atags]
-        print doc['metadata']
-        sys.exit("ggg")
+                                atags = child.find_all('a')
+                                if len(atags) < 2:
+                                    [s.extract() for s in child(class_='viewAll')]
+                                    doc['metadata'][last_title] = ''.join(child.stripped_strings).strip()
+                                else:
+                                    doc['metadata'][last_title] = [item.string.replace(",","").strip() for item in atags]
+            print doc['metadata']
+            #time.sleep(2)