From: Alex Sadleir
Date: Sun, 17 Feb 2013 04:28:11 +0000
Subject: finished gazette parser
X-Git-Url: http://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=b603f1a5405e2194c8795aaaf1c0bd663e21e238

---

finished gazette parser

Former-commit-id: 65e9b38b538386e7cab79cc166878d1b19090cb6
---

--- a/documents/gazette.py
+++ b/documents/gazette.py
@@ -5,44 +5,53 @@
 from unidecode import unidecode
 
-listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=3950"
-(url, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
-                                             listurl, "gazette", "AGD")
-for line in listhtml.split('\n'):
-    soup = BeautifulSoup(line)
-    #print line
-    for row in soup.find_all('tr'):
-        print line
-        if row.has_key('valign'):
-            i = 0
-            date = ""
-            id = ""
-            type = ""
-            description = ""
-            name = ""
-            url = ""
-            for col in soup.find_all('td'):
-                #print ''.join(col.stripped_strings)
-                if i == 0:
-                    date = ''.join(col.stripped_strings)
-                if i == 1:
-                    id = ''.join(col.stripped_strings)
-                if i == 2:
-                    type = ''.join(col.stripped_strings)
-                if i == 3:
-                    description = ''.join(col.stripped_strings)
-                    for link in col.findAll('a'):
-                        if link.has_key("href"):
-                            url = link['href']
-                            name = ''.join(link.stripped_strings)
-                print [date, id, type, description, name, url]
-                i = i + 1
-            #url = scrape.fullurl(listurl, atag['href'])
-            #(url, mime_type, html) = scrape.fetchURL(scrape.docsdb,
-            #    url, "data", "AGIMO")
-            #hash = scrape.mkhash(scrape.canonurl(url))
-            #doc = scrape.docsdb.get(hash)
-            #print doc['metadata']
-            #scrape.docsdb.save(doc)
-            #time.sleep(2)
+items = 3950
+#items = 1  # debug override: uncomment to test against a single screen
+while True:
+    print str(items) + " (" + str(items / 25) + " screens to go)"
+    listurl = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start=" + str(items)
+    (listurl, mime_type, listhtml) = scrape.fetchURL(scrape.docsdb,
+                                                     listurl, "gazette", "AGD", False)
+    for line in listhtml.split('\n'):
+        soup = BeautifulSoup(line)
+        #print line
+        for row in soup.find_all('tr'):
+            if row.has_key('valign'):
+                i = 0
+                date = ""
+                id = ""
+                type = ""
+                description = ""
+                name = ""
+                url = ""
+                for col in soup.find_all('td'):
+                    #print ''.join(col.stripped_strings)
+                    if i == 0:
+                        date = ''.join(col.stripped_strings)
+                    if i == 1:
+                        id = ''.join(col.stripped_strings)
+                    if i == 2:
+                        type = ''.join(col.stripped_strings)
+                    if i == 3:
+                        description = ''.join(col.stripped_strings)
+                        for link in col.findAll('a'):
+                            if link.has_key("href"):
+                                url = link['href']
+                                name = ''.join(link.stripped_strings)
+                                print str(items) + " (" + str(items / 25) + " screens to go)"
+                                print [date, id, type, description, name, url]
+                                itemurl = scrape.fullurl(listurl, url)
+                                (itemurl, mime_type, html) = scrape.fetchURL(scrape.docsdb,
+                                                                             itemurl, "gazette", "AGD", False)
+                                hash = scrape.mkhash(scrape.canonurl(itemurl))
+                                doc = scrape.docsdb.get(hash)
+                                # each field under its own key ("id", not a second "date")
+                                doc['metadata'] = {"date": date, "id": id, "type": type,
+                                                   "description": description, "name": name, "url": url}
+                                scrape.docsdb.save(doc)
+                                #time.sleep(2)
+                    i = i + 1
+    items = items - 25
+    if items <= 0:
+        break

--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -13,8 +13,8 @@
 import socket
 
 #couch = couchdb.Server('http://192.168.1.148:5984/')
-#couch = couchdb.Server('http://192.168.1.113:5984/')
-couch = couchdb.Server('http://127.0.0.1:5984/')
+couch = couchdb.Server('http://192.168.1.113:5984/')
+#couch = couchdb.Server('http://127.0.0.1:5984/')
 
 def mkhash(input):
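
Note: for readers following the parser logic above, here is a minimal standalone sketch of the same approach in Python 3 with requests and BeautifulSoup 4. The endpoint URL and the 25-rows-per-screen stride come from the diff; the repo's own plumbing (scrape.fetchURL, scrape.docsdb, scrape.mkhash) is replaced with plain HTTP and an in-memory list, so treat this as an illustration of the control flow, not the project's API.

    # Sketch only: walks the Notes view backwards 25 rows at a time,
    # mirroring the while-loop in the diff above. Assumes Python 3
    # with requests and bs4 installed; these names are not from the repo.
    import requests
    from bs4 import BeautifulSoup

    BASE = "http://gazettes.ag.gov.au/portal/govgazonline.nsf/publications?OpenView&Start="
    ROWS_PER_SCREEN = 25

    def scrape_gazettes(start=3950):
        records = []
        items = start
        while items > 0:
            print("%d (%d screens to go)" % (items, items // ROWS_PER_SCREEN))
            html = requests.get(BASE + str(items), timeout=30).text
            soup = BeautifulSoup(html, "html.parser")
            # Data rows in the view markup carry a valign attribute,
            # which is the same thing the diff keys on.
            for row in soup.find_all("tr", valign=True):
                cols = row.find_all("td")
                if len(cols) < 4:
                    continue
                date, gazette_id, gtype, description = (
                    "".join(c.stripped_strings) for c in cols[:4])
                link = cols[3].find("a", href=True)
                if link is None:
                    continue
                records.append({
                    "date": date,
                    "id": gazette_id,   # separate key, not a second "date"
                    "type": gtype,
                    "description": description,
                    "name": "".join(link.stripped_strings),
                    "url": link["href"],
                })
            items -= ROWS_PER_SCREEN
        return records

Scoping the td lookup to each row (rather than the whole soup, as the diff does) gives the same result here because each soup is built from a single line of HTML, but it stays correct if the markup ever puts more than one row on a line.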
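
Note on the scrape.py hunk: the change flips between two hard-coded CouchDB servers by swapping which line is commented out. One alternative, sketched here under the assumption that an environment variable is acceptable (COUCH_URL is hypothetical, not part of the repo), is to read the server address from the environment with a localhost fallback:

    # Sketch only: COUCH_URL is a hypothetical environment variable.
    import os
    import couchdb

    couch = couchdb.Server(os.environ.get("COUCH_URL", "http://127.0.0.1:5984/"))

This keeps developer and server setups on the same code path instead of relying on commented-out lines.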