more scrapers
Former-commit-id: 96bfae466ef6496e0bc9469d556f599c5e0e3d92
--- a/.gitmodules
+++ b/.gitmodules
@@ -31,4 +31,7 @@
[submodule "documents/lib/parsedatetime"]
path = documents/lib/parsedatetime
url = git://github.com/bear/parsedatetime.git
+[submodule "lib/FeedWriter"]
+ path = lib/FeedWriter
+ url = https://github.com/mibe/FeedWriter
--- /dev/null
+++ b/documents/.gitignore
@@ -1,1 +1,2 @@
+*.pyc
--- a/documents/disclogsList.php
+++ b/documents/disclogsList.php
@@ -8,6 +8,11 @@
<tr><th>Agency Name</th><th>Disclosure Log URL recorded?</th><th>Do we monitor this URL?</th></tr>";
$agenciesdb = $server->get_db('disclosr-agencies');
$docsdb = $server->get_db('disclosr-documents');
+$agencies = 0;
+$disclogs = 0;
+$red = 0;
+$green = 0;
+$orange = 0;
try {
$rows = $agenciesdb->get_view("app", "byCanonicalName", null, true)->rows;
@@ -19,26 +24,30 @@
if ($ENV == "DEV")
echo "<br>(" . $row->id . ")";
echo "</td>\n";
-
+$agencies++;
echo "<td>";
if (isset($row->value->FOIDocumentsURL)) {
+ $disclogs++;
echo '<a href="' . $row->value->FOIDocumentsURL . '">'
. $row->value->FOIDocumentsURL . '</a>';
if ($ENV == "DEV")
echo '<br><small>(<a href="viewDocument.php?hash=' . md5($row->value->FOIDocumentsURL) . '">'
. 'view local copy</a>)</small>';
} else {
- echo "<font color='red'>✘</font>";
+ echo "<font color='red'><abbr title='No'>✘</abbr></font>";
}
echo "</td>\n<td>";
if (isset($row->value->FOIDocumentsURL)) {
if (file_exists("./scrapers/" . $row->id . '.py')) {
- echo "<font color='green'>✔</font>";
+ echo "<font color='green'><abbr title='Yes'>✔</abbr></font>";
+ $green++;
} else if (file_exists("./scrapers/" . $row->id . '.txt')) {
- echo "<font color='blue'><b>▬</b></font>";
+ echo "<font color='orange'><abbr title='Work in progress'><b>▬</b></abbr></font>";
+ $orange++;
} else {
- echo "<font color='red'>✘</font>";
+ echo "<font color='red'><abbr title='No'>✘</abbr></font>";
+ $red++;
}
}
echo "</td></tr>\n";
@@ -48,5 +57,9 @@
setteErrorHandler($e);
}
echo "</table>";
+// Summary line: every ratio is guarded so an empty agency list or zero recorded logs cannot divide by zero.
+echo $agencies." agencies, ".($agencies ? round(($disclogs/$agencies)*100) : 0)."% with disclosure logs; "
+.($disclogs ? round(($green/$disclogs)*100) : 0)."% logs with scrapers ".($disclogs ? round(($red/$disclogs)*100) : 0)."% logs without scrapers ".($disclogs ? round(($orange/$disclogs)*100) : 0)."% logs Work-In-Progress scrapers ";
include_footer_documents();
?>
+
--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -55,7 +55,7 @@
doc = foidocsdb.get(hash)
#print doc
if doc == None:
- print "saving"
+ print "saving "+ hash
edate = datetime.fromtimestamp(mktime( entry.published_parsed)).strftime("%Y-%m-%d")
doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': entry.link, 'docID': entry.id,
"date": edate,"title": entry.title}
@@ -84,14 +84,27 @@
doc.update({'description': descriptiontxt})
return
def getTitle(self, content, entry, doc):
- doc.update({'title': content.string})
+ doc.update({'title': (''.join(content.stripped_strings))})
return
def getTable(self, soup):
return soup.table
+ def getRows(self, table):
+ return table.find_all('tr')
def getDate(self, content, entry, doc):
- edate = parse(content.string.strip(), dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+ date = ''.join(content.stripped_strings).strip()
+ date = date.replace("Octber","October")
+ print date
+ edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
print edate
doc.update({'date': edate})
+ return
+ def getLinks(self, content, entry, doc):
+     """Pass the href of every <a> inside `entry` to scrape.fullurl together with `content`
+     (the caller supplies the page URL) and store the results under doc['links'] when any exist."""
+     # href=True matches only anchors that carry the attribute; it replaces the deprecated Tag.has_key().
+     links = [scrape.fullurl(content, atag['href']) for atag in entry.find_all("a", href=True)]
+     if links:
+         doc.update({'links': links})
return
def doScrape(self):
@@ -102,31 +115,26 @@
# http://www.crummy.com/software/BeautifulSoup/documentation.html
soup = BeautifulSoup(content)
table = self.getTable(soup)
- for row in table.find_all('tr'):
+ for row in self.getRows(table):
columns = row.find_all('td')
if len(columns) == self.getColumnCount():
- (id, date, description, title, notes) = self.getColumns(columns)
- print id.string
+ (id, date, title, description, notes) = self.getColumns(columns)
+ print ''.join(id.stripped_strings)
if id.string == None:
- hash = scrape.mkhash(self.remove_control_chars(url+date.string))
+ hash = scrape.mkhash(self.remove_control_chars(url+(''.join(date.stripped_strings))))
else:
- hash = scrape.mkhash(self.remove_control_chars(url+id.string))
- links = []
- for atag in row.find_all("a"):
- if atag.has_key('href'):
- links.append(scrape.fullurl(url,atag['href']))
+ hash = scrape.mkhash(self.remove_control_chars(url+(''.join(id.stripped_strings))))
doc = foidocsdb.get(hash)
if doc == None:
- print "saving"
- doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
- if links != []:
- doc.update({'links': links})
+ print "saving " +hash
+ doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': (''.join(id.stripped_strings))}
+ self.getLinks(self.getURL(),row,doc)
self.getTitle(title,row, doc)
self.getDate(date,row, doc)
self.getDescription(description,row, doc)
if notes != None:
- doc.update({ 'notes': notes.string})
+ doc.update({ 'notes': (''.join(notes.stripped_strings))})
foidocsdb.save(doc)
else:
print "already saved "+hash
--- a/documents/index.php
+++ b/documents/index.php
@@ -20,7 +20,7 @@
if ($rows) {
foreach ($rows as $row) {
-displayLogEntry($row,$idtoname);
+echo displayLogEntry($row,$idtoname);
}
}
} catch (SetteeRestClientException $e) {
--- a/documents/robots.txt
+++ b/documents/robots.txt
@@ -2,4 +2,5 @@
# http://code.google.com/web/controlcrawlindex/
User-agent: *
-
+Disallow: /admin/
+Sitemap: http://disclosurelo.gs/sitemap.xml.php
--- a/documents/rss.xml.php
+++ b/documents/rss.xml.php
@@ -3,28 +3,38 @@
// Agency X updated Y, new files, diff of plain text/link text,
// feed for just one agency or all
// This is a minimum example of using the Universal Feed Generator Class
-include("lib/FeedWriter.php");
+include("../lib/FeedWriter/FeedTypes.php");
+include_once('../include/common.inc.php');
//Creating an instance of FeedWriter class.
-$TestFeed = new FeedWriter(RSS2);
+$TestFeed = new RSS2FeedWriter();
//Setting the channel elements
//Use wrapper functions for common channelelements
$TestFeed->setTitle('Last Modified - All');
-$TestFeed->setLink('http://disclosr.lambdacomplex.org/rss.xml.php');
+$TestFeed->setLink('http://disclosurelo.gs/rss.xml.php');
$TestFeed->setDescription('This is test of creating a RSS 2.0 feed Universal Feed Writer');
+ $TestFeed->setChannelElement('language', 'en-us');
+ $TestFeed->setChannelElement('pubDate', date(DATE_RSS, time()));
//Retriving informations from database
-$rows = $db->get_view("app", "byLastModified")->rows;
+$idtoname = Array();
+$agenciesdb = $server->get_db('disclosr-agencies');
+foreach ($agenciesdb->get_view("app", "byCanonicalName")->rows as $row) {
+ $idtoname[$row->id] = trim($row->value->name);
+}
+$foidocsdb = $server->get_db('disclosr-foidocuments');
+$rows = $foidocsdb->get_view("app", "byDate", Array('9999-99-99','0000-00-00'), true)->rows;
//print_r($rows);
foreach ($rows as $row) {
//Create an empty FeedItem
$newItem = $TestFeed->createNewItem();
//Add elements to the feed item
- $newItem->setTitle($row['name']);
- $newItem->setLink($row['id']);
- $newItem->setDate(date("c", $row['metadata']['lastModified']));
- $newItem->setDescription($row['name']);
+ $newItem->setTitle($row->value->title);
+ $newItem->setLink("http://disclosurelo.gs/view.php?id=".urlencode($row->value->docID)); // RSS item links must be absolute URLs; docID is escaped for use in the query string
+ $newItem->setDate(date("c", strtotime($row->value->date)));
+ $newItem->setDescription(displayLogEntry($row,$idtoname));
+ $newItem->addElement('guid', $row->value->_id,array('isPermaLink'=>'false')); // _id is an opaque document identifier, not a URL, so it must not be flagged as a permalink
//Now add the feed item
$TestFeed->addItem($newItem);
}
//OK. Everything is done. Now genarate the feed.
-$TestFeed->genarateFeed();
+$TestFeed->generateFeed();
?>
--- a/documents/scrape.py
+++ b/documents/scrape.py
@@ -204,12 +204,12 @@
scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
if key == 'website' and False:
scrapeAndStore(docsdb, agency[key],0,key,agency['_id'])
+ agency['metadata']['lastScraped'] = time.time()
if key.endswith('URL') and False:
print key
depth = 1
if 'scrapeDepth' in agency.keys():
depth = agency['scrapeDepth']
scrapeAndStore(docsdb, agency[key],depth,key,agency['_id'])
- agency['metadata']['lastScraped'] = time.time()
agencydb.save(agency)
--- /dev/null
+++ b/documents/scrapers/00a294de663db69062ca09aede7c0487.txt
@@ -1,1 +1,2 @@
+multipage
--- /dev/null
+++ b/documents/scrapers/0324e4b1654fd6dd651307abcef67094.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 6
+ def getColumns(self,columns):
+ (id, date, title, description, notes,link) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/0603dfcc930a791efaa64f31ae5fceda.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id="body-content").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/0e46f8bd1414b1fdd4f0543d54a97500.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id = "maincontentcontainer").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (date, disclogdate, title, description, notes) = columns
+ return (date, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/1097fa8afdcf5db89d212d0979226667.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- a/documents/scrapers/157cbe654bdaa0a48e6650152ae34489.py
+++ b/documents/scrapers/157cbe654bdaa0a48e6650152ae34489.py
@@ -10,7 +10,7 @@
return 5
def getColumns(self,columns):
(id, date, title, description, notes) = columns
- return (id, date, description, title, notes)
+ return (id, date, title, description, notes)
def getTable(self,soup):
return soup.find_all('table')[4]
--- /dev/null
+++ b/documents/scrapers/1803322b27286950cab0c543168b5f21.txt
@@ -1,1 +1,2 @@
+multipage log
--- /dev/null
+++ b/documents/scrapers/1ad74ca88932f90f0b92b69387171441.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/1fda9544d2a3fa4cd92aec4b206a6763.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+     return soup.find(class_ = "article-content").table  # bs4 spells the CSS-class filter `class_`; `_class` filtered on an attribute literally named "_class" and found nothing
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, title, date, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/24bd71114d3975ed9a63ad29624c62c9.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id = "inner_content")
+ def getColumnCount(self):
+ return 2
+ def getColumns(self,columns):
+ (date, title) = columns
+ return (date, date, title, title, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/269680be088f3d8e663251655f3825b4.py
@@ -1,1 +1,17 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumns(self,columns):
+ (date, id, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/31685505438d393f45a90f442b8fa27f.txt
@@ -1,1 +1,2 @@
+pdf
--- /dev/null
+++ b/documents/scrapers/31b7c75cf484747b6b120680bddd33b0.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 2
+ def getColumns(self,columns):
+ (title, date) = columns
+ return (date, date, title, title, None)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/329fccdca068b78ab7edd550e2957398.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 7
+ def getTable(self,soup):
+ return soup.find(class_ = "foi-disclosure")
+ def getColumns(self,columns):
+ (disclogid, id, date, title, link, removedate, notes) = columns
+ return (id, date, title, title, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/38ca99d2790975a40dde3fae41dbdc3d.py
@@ -1,1 +1,32 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+import dateutil
+from dateutil.parser import *
+from datetime import *
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 3
+ def getColumns(self,columns):
+ (date, title, description) = columns
+ return (date, date, title, description, None)
+ def getTitle(self, content, entry, doc):
+     """Store the first two stripped text fragments of `content`, concatenated, as doc['title'] and print it."""
+     heading = ""
+     for position, fragment in enumerate(content.stripped_strings):
+         if position >= 2:
+             break
+         heading = heading + fragment
+     doc.update({'title': heading})
+     print heading
+     return
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/3b54190e3f409380e109fae29e1917aa.py
@@ -1,1 +1,19 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getColumnCount(self):
+ return 7
+ def getColumns(self,columns):
+ (id, date, title, description, link, deldate,notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- a/documents/scrapers/3cd40b1240e987cbcd3f0e67054ce259.py
+++ b/documents/scrapers/3cd40b1240e987cbcd3f0e67054ce259.py
@@ -7,7 +7,7 @@
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
def getColumns(self,columns):
(id, date, description, title, notes) = columns
- return (id, date, description, title, notes)
+ return (id, date, title, description, notes)
if __name__ == '__main__':
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
--- /dev/null
+++ b/documents/scrapers/3d426eb8c85c8f04b814eee597efd866.py
@@ -1,1 +1,21 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(id = "primary").table
+ def getColumnCount(self):
+ return 5
+ def getColumns(self,columns):
+ (id, date, title, description, notes) = columns
+ return (id, date, title, description, notes)
+
+if __name__ == '__main__':
+ print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
+ print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
+ ScraperImplementation().doScrape()
+
--- /dev/null
+++ b/documents/scrapers/3d5871a44abbbc81ef5b3a420070755d.py
@@ -1,1 +1,47 @@
+import sys,os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
+import genericScrapers
+import scrape
+from bs4 import BeautifulSoup
+import dateutil
+from dateutil.parser import *
+from datetime import *
+#http://www.doughellmann.com/PyMOTW/abc/
+class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
+ def getTable(self,soup):
+ return soup.find(class_ = "inner-column").table
+ def getRows(self,table):
+ return table.tbody.find_all('tr',recursive=False)
+ def getColumnCount(self):
+ return 3
+ def getColumns(self,columns):
+ (date, title, description) = columns
+ return (date, date, title, description, None)
+ def getDate(self, content, entry, doc):
+ i = 0
+ date = ""
+ for string in content.stripped_strings:
+ if i ==1:
+ date = string
+ i = i+1
+ edate = parse(date, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
+ print edate
+ doc.update({'date': edate})
+ return
+ def getTitle(self, content, entry, doc):
+ i = 0
+ title = ""
+ for string in