Add more agency scrapers; refactor row link extraction into a getLinks hook and read table cells via stripped_strings
Former-commit-id: 0f8fae54a999065eb27885ebde8045a1d5a3c8b4
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -55,7 +55,7 @@
doc = foidocsdb.get(hash)
#print doc
if doc == None:
- print "saving"
+ print "saving "+ hash
edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d")
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
"date": edate,"title": entry.title}
@@ -89,9 +89,17 @@
def getTable(self, soup):
return soup.table
def getDate(self, content, entry, doc):
- edate = parse(content.string.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+ edate = parse(''.join(content.stripped_strings).strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
print edate
doc.update({'date': edate})
+ return
+ def getLinks(self, content, entry, doc): # collect absolute URLs from any <a> tags in this entry
+ links = []
+ for atag in entry.find_all("a"):
+ if atag.has_attr('href'):
+ links.append(scrape.fullurl(content, atag['href']))
+ if links != []:
+ doc.update({'links': links})
return
def doScrape(self):
@@ -106,22 +114,17 @@
columns = row.find_all('td')
if len(columns) == self.getColumnCount():
(id, date, description, title, notes) = self.getColumns(columns)
- print id.string
+ print ''.join(id.stripped_strings)
if id.string == None:
- hash = scrape.mkhash(self.remove_control_chars(url+date.string))
+ hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
else:
- hash = scrape.mkhash(self.remove_control_chars(url+id.string))
- links = []
- for atag in row.find_all("a"):
- if atag.has_key('href'):
- links.append(scrape.fullurl(url,atag['href']))
+ hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings))))
doc = foidocsdb.get(hash)
if doc == None:
- print "saving"
+ print "saving " +hash
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
- if links != []:
- doc.update({'links': links})
+ self.getLinks(self.getURL(),row,doc)
self.getTitle(title,row, doc)
self.getDate(date,row, doc)
self.getDescription(description,row, doc)
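For orientation, a minimal sketch of how an agency scraper plugs into the refactored hooks above; the "disclog-table" wrapper class and the column order are placeholders rather than any real agency's markup, and getLinks, getTitle, getDate and getDescription fall back to the generic implementations unless overridden.

import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers

class ExampleScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    def getTable(self, soup):
        # take the table inside the (hypothetical) "disclog-table" wrapper
        # instead of the first table on the page
        return soup.find(class_="disclog-table").table

    def getColumns(self, columns):
        # map this (hypothetical) agency's column order onto the
        # (id, date, description, title, notes) tuple that doScrape expects
        (id, date, title, description, notes) = columns
        return (id, date, description, title, notes)

if __name__ == '__main__':
    # doScrape walks the table rows, hashes each row and saves new rows to foidocsdb
    ExampleScraperImplementation().doScrape()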
--- /dev/null
+++ b/documents/scrapers/53d2884f8afd026096a27bd5051ec50e.py
@@ -1,1 +1,39 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(class_ = "ms-rtestate-field").table
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, description, title, notes)
+
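+ # getLinks is overridden: each row's link is fetched with scrape.fetchURL and, when an
+ # HTML page comes back, the links inside its "article-content" block are collected;
+ # the document's url is then pointed at that article page.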
+ def getLinks(self, content, entry, doc):
+ link = None
+ links = []
+ for atag in entry.find_all('a'):
+ if atag.has_attr('href'):
+ link = scrape.fullurl(self.getURL(),atag['href'])
+ (url,mime_type,htcontent) = scrape.fetchURL(scrape.docsdb, link, "foidocuments", self.getAgencyID(), False)
+ if htcontent != None:
+ if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type =="application/xml":
+ # http://www.crummy.com/software/BeautifulSoup/documentation.html
+ soup = BeautifulSoup(htcontent)
+ for atag in soup.find(class_ = "article-content").find_all('a'):
+ if atag.has_attr('href'):
+ links.append(scrape.fullurl(link,atag['href']))
+
+ if links != []:
+ doc.update({'links': links})
+ doc.update({'url': link})
+ return
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/7c6adc1d41cf029bf1a0959e5156477a.txt
--- /dev/null
+++ b/documents/scrapers/8ef0e5802f99800f514b3a148e013b75.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 4
+ def getTable(self,soup):
+ return soup.find(class_ = "content").table
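+ # this log only has four columns, so notes is returned as None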
+ def getColumns(self,columns):
+ (id, date, title, description) = columns
+ return (id, date, description, title, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/d72744fb1e5d6e87af9a5ea16cc27fa5.txt