beginnings of document scrapers mark 2
[disclosr.git] / documents / scrapers /
blob:a/documents/scrapers/ -> blob:b/documents/scrapers/
--- a/documents/scrapers/
+++ b/documents/scrapers/
@@ -1,1 +1,40 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import scrape
+foidocsdb = scrape.couch['disclosr-foidocuments']
+# The RSS feed is not detailed enough, so the HTML page is scraped instead.
+from bs4 import BeautifulSoup
+agencyID = "3cd40b1240e987cbcd3f0e67054ce259"
+(url,mime_type,content) = scrape.fetchURL(scrape.docsdb, "", "foidocuments", agencyID)
+if content != None:
+	if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+            #
+		soup = BeautifulSoup(content)
+		for row in soup.table.find_all('tr'):
+			columns = row.find_all('td')
+			if len(columns) == 5:
+				(id, date, description, title, notes) = columns
+				print id.string
+				hash = scrape.mkhash(url+id.string)
+				links = []
+				for atag in row.find_all("a"):
+ 					if atag.has_key('href'):
+						links.append(scrape.fullurl(url,atag['href']))
+				doc = foidocsdb.get(hash)
+				descriptiontxt = ""
+				for string in description.stripped_strings:
+					descriptiontxt = descriptiontxt + string
+    				if doc == None:
+					print "saving"
+	 				doc = {'_id': hash, 'agencyID': agencyID, 'url': url, "links": links, 'docID': id.string, "date": date.string, "description": descriptiontxt,"title": title.string,"notes": notes.string}
+				else:
+					print "already saved"
+			elif len(row.find_all('th')) == 5:
+				print "header row"
+			else:
+				print "ERROR number of columns incorrect"
+				print row