# scraper fixes
#
# Former-commit-id: 7dbe6fcea0360db2b17d5c5e4a34f632e3fb3e06

# Make the parent directory importable so the shared scraper helpers resolve.
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import traceback

# amonpy (error-reporting service client) is optional: configure it when it is
# installed, otherwise record that it is unavailable so the error handler in
# the __main__ block can skip the remote report.
# NOTE(review): address/secret_key below are placeholder values — they must be
# replaced with the real settings from /etc/amon.conf before reports will send.
try:
    import amonpy
    amonpy.config.address = 'http://amon_instance:port'
    amonpy.config.secret_key = 'the secret key from /etc/amon.conf'
    amon_available = True
except ImportError:
    amon_available = False
   
class ScraperImplementation(genericScrapers.GenericPDFDisclogScraper):
    """Disclosure-log scraper for a PDF-published log.

    All scraping behaviour is inherited from GenericPDFDisclogScraper;
    this subclass only binds the generic logic to this agency's log.
    """

    def __init__(self):
        super(ScraperImplementation, self).__init__()
   
   
if __name__ == '__main__': if __name__ == '__main__':
print 'Subclass:', issubclass(ScraperImplementation, print 'Subclass:', issubclass(ScraperImplementation,
genericScrapers.GenericPDFDisclogScraper) genericScrapers.GenericPDFDisclogScraper)
print 'Instance:', isinstance(ScraperImplementation(), print 'Instance:', isinstance(ScraperImplementation(),
genericScrapers.GenericPDFDisclogScraper) genericScrapers.GenericPDFDisclogScraper)
try: try:
ScraperImplementation().doScrape() ScraperImplementation().doScrape()
except Exception, err: except Exception, err:
sys.stderr.write('ERROR: %s\n' % str(err)) sys.stderr.write('ERROR: %s\n' % str(err))
print "Error Reason: ", err.__doc__ print "Error Reason: ", err.__doc__
print "Exception: ", err.__class__ print "Exception: ", err.__class__
print traceback.format_exc() print traceback.format_exc()
if amon_available: if amon_available:
data = { data = {
'exception_class': '', 'exception_class': '',
'url': '', 'url': '',
'backtrace': ['exception line ', 'another exception line'], 'backtrace': ['exception line ', 'another exception line'],
'enviroment': '', 'enviroment': '',
# In 'data' you can add request information, session variables - it's a recursive # In 'data' you can add request information, session variables - it's a recursive
# dictionary, so you can literally add everything important for your specific case # dictionary, so you can literally add everything important for your specific case
# The dictionary doesn't have a specified structure, the keys below are only example # The dictionary doesn't have a specified structure, the keys below are only example
'data': {'request': '', 'session': '', 'more': ''} 'data': {'request': '', 'session': '', 'more': ''}
   
} }
  #amonpy.exception(data)
amonpy.exception(data)  
pass pass
   
# Make the parent directory importable so the shared scraper helpers resolve.
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup
   
# http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """OAIC disclosure-log scraper for a table embedded in a content div."""

    def getTable(self, soup):
        # The disclosure log lives inside a specific content div on the page.
        return soup.find(id="content_div_50269").table

    def getColumns(self, columns):
        # The table's five columns, in on-page order.
        (id, date, title, description, notes) = columns
        return (id, date, title, description, notes)
   
if __name__ == '__main__': if __name__ == '__main__':
print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper) print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper) print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
ScraperImplementation().doScrape() ScraperImplementation().doScrape()
   
# Make the parent directory importable so the shared scraper helpers resolve.
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import genericScrapers
import scrape
from bs4 import BeautifulSoup
 
# http://www.doughellmann.com/PyMOTW/abc/
class ScraperImplementation(genericScrapers.GenericOAICDisclogScraper):
    """OAIC disclosure-log scraper using the base class's default table lookup."""

    def getColumns(self, columns):
        # The table's five columns, in on-page order.
        (id, date, title, description, notes) = columns
        return (id, date, title, description, notes)
 
  if __name__ == '__main__':
  print 'Subclass:', issubclass(ScraperImplementation, genericScrapers.GenericOAICDisclogScraper)
  print 'Instance:', isinstance(ScraperImplementation(), genericScrapers.GenericOAICDisclogScraper)
  ScraperImplementation().doScrape()