pdf scrapers
[disclosr.git] / documents / genericScrapers.py
import sys
import os
 
sys.path.insert(0, os.path.join(os.path.dirname(__file__) or '.', '../'))
import scrape
from bs4 import BeautifulSoup
from time import mktime
import feedparser
import abc
import unicodedata
import re
import dateutil
from dateutil.parser import *
from datetime import *
import codecs
import zipfile
from lxml import etree
# getdocumenttext is assumed to come from the legacy python-docx module;
# GenericDOCXDisclogScraper below relies on it.
from docx import getdocumenttext

import difflib
 
from StringIO import StringIO
 
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
 
 
class GenericDisclogScraper(object):
    __metaclass__ = abc.ABCMeta
    agencyID = None
    disclogURL = None
 
    def remove_control_chars(self, input):
        """ strip characters outside printable ASCII """
        return "".join([i for i in input if ord(i) in range(32, 127)])
 
    def getAgencyID(self):
        """ disclosr agency id """
        if self.agencyID is None:
            self.agencyID = os.path.basename(sys.argv[0]).replace(".py", "")
        return self.agencyID
 
    def getURL(self):
        """ disclog URL"""
        if self.disclogURL is None:
            agency = scrape.agencydb.get(self.getAgencyID())
            self.disclogURL = agency['FOIDocumentsURL']
        return self.disclogURL
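
    # The agency record fetched above comes from the disclosr CouchDB
    # (scrape.agencydb), keyed by agency ID; the only field used here is
    # FOIDocumentsURL. A minimal sketch of such a record (the ID and URL are
    # illustrative only):
    #
    #   {"_id": "exampleagency", "FOIDocumentsURL": "http://www.example.gov.au/foi/disclosure-log"}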
 
    @abc.abstractmethod
    def doScrape(self):
        """ do the scraping """
        return
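
# A minimal sketch of how a per-agency scraper script typically uses these
# classes (the subclass body and __main__ guard are illustrative; getAgencyID()
# derives the agency ID from the script's filename):
#
#   import genericScrapers
#
#   class ScraperImplementation(genericScrapers.GenericHTMLDisclogScraper):
#       pass
#
#   if __name__ == '__main__':
#       ScraperImplementation().doScrape()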
 
 
class GenericHTMLDisclogScraper(GenericDisclogScraper):
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, rcontent) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        content = rcontent
        dochash = scrape.mkhash(content)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            description = "This log may have updated but as it was not in a table last time we viewed it, we cannot extract what has changed. Please refer to the agency's website Disclosure Log to see the most recent entries"
            last_attach = scrape.getLastAttachment(scrape.docsdb, self.getURL())
            if last_attach != None:
                html_diff = difflib.HtmlDiff()
                diff = html_diff.make_table(last_attach.read().split('\n'),
                    content.split('\n'))
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID()
            , 'url': self.getURL(), 'docID': dochash,
            "date": edate, "title": "Disclosure Log Updated", 
            "description":  self.remove_control_chars(description), "diff": self.remove_control_chars(diff)}
            foidocsdb.save(doc)
        else:
            print "already saved"
 
 
class GenericPDFDisclogScraper(GenericDisclogScraper):
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
            laparams=laparams)
        fp = StringIO()
        fp.write(content)
 
        process_pdf(rsrcmgr, device, fp, set(), caching=True,
            check_extractable=True)
        description = outfp.getvalue()
        fp.close()
        device.close()
        outfp.close()
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                   'url': self.getURL(), 'docID': dochash,
                   "date": edate, "title": "Disclosure Log Updated",
                   "description": self.remove_control_chars(description)}
            foidocsdb.save(doc)
        else:
            print "already saved"
 
 
class GenericDOCXDisclogScraper(GenericDisclogScraper):
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb
            , self.getURL(), "foidocuments", self.getAgencyID())
        # Read the main document part out of the .docx (a zip archive).
        mydoc = zipfile.ZipFile(StringIO(content))
        xmlcontent = mydoc.read('word/document.xml')
        document = etree.fromstring(xmlcontent)
        # Fetch all the paragraph text out of the document we just parsed.
        paratextlist = getdocumenttext(document)
        # Make an explicit UTF-8 encoded version of each paragraph.
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        # Join the paragraphs with a blank line between each.
        description = '\n\n'.join(newparatextlist).strip(' \t\n\r')
        dochash = scrape.mkhash(description)
        doc = foidocsdb.get(dochash)
 
        if doc is None:
            print "saving " + dochash
            edate = date.today().strftime("%Y-%m-%d")
            doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                   'url': self.getURL(), 'docID': dochash,
                   "date": edate, "title": "Disclosure Log Updated",
                   "description": description}
            foidocsdb.save(doc)
        else:
            print "already saved"
 
 
class GenericRSSDisclogScraper(GenericDisclogScraper):
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        feed = feedparser.parse(content)
        for entry in feed.entries:
            #print entry
            print entry.id
            dochash = scrape.mkhash(entry.id)
            doc = foidocsdb.get(dochash)
            #print doc
            if doc is None:
                print "saving " + dochash
                edate = datetime.fromtimestamp(
                    mktime(entry.published_parsed)).strftime("%Y-%m-%d")
                doc = {'_id': dochash, 'agencyID': self.getAgencyID(),
                       'url': entry.link, 'docID': entry.id,
                       "date": edate, "title": entry.title}
                self.getDescription(entry, entry, doc)
                foidocsdb.save(doc)
            else:
                print "already saved"
 
    def getDescription(self, content, entry, doc):
        """ get description from rss entry """
        doc.update({'description': content.summary})
        return
 
 
class GenericOAICDisclogScraper(GenericDisclogScraper):
    __metaclass__ = abc.ABCMeta
 
    @abc.abstractmethod
    def getColumns(self, columns):
        """ rearranges columns if required """
        return
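
    # A sketch of a typical concrete getColumns() (the column order is
    # hypothetical; each agency's disclosure-log table differs):
    #
    #   def getColumns(self, columns):
    #       (id, date, title, description, notes) = columns
    #       return (id, date, title, description, notes)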
 
    def getColumnCount(self):
        return 5
 
    def getDescription(self, content, entry, doc):
        """ get description from rss entry"""
        descriptiontxt = ""
        for string in content.stripped_strings:
            descriptiontxt = descriptiontxt + " \n" + string
        doc.update({'description': descriptiontxt})
 
    def getTitle(self, content, entry, doc):
        doc.update({'title': (''.join(content.stripped_strings))})
 
    def getTable(self, soup):
        return soup.table
 
    def getRows(self, table):
        return table.find_all('tr')
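
    # Subclasses can override getTable()/getRows() when the disclosure log is
    # not the first <table> on the page; the selector below is illustrative:
    #
    #   def getTable(self, soup):
    #       return soup.find(id='content').table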
 
    def getDate(self, content, entry, doc):
        strdate = ''.join(content.stripped_strings).strip()
        (a, b, c) = strdate.partition("(")
        # Correct common typos seen in agency-entered dates before parsing.
        strdate = self.remove_control_chars(a.replace("Octber", "October")
            .replace("Janrurary", "January").replace("1012", "2012"))
        print strdate
        try:
            edate = parse(strdate, dayfirst=True, fuzzy=True).strftime("%Y-%m-%d")
        except ValueError:
            print >> sys.stderr, "ERROR date invalid %s " % strdate
            print >> sys.stderr, "ERROR date originally %s " % ''.join(content.stripped_strings).strip()
            edate = date.today().strftime("%Y-%m-%d")
        print edate
        doc.update({'date': edate})
        return
 
    def getLinks(self, content, entry, doc):
        links = []
        for atag in entry.find_all("a"):
            if atag.has_attr('href'):
                links.append(scrape.fullurl(content, atag['href']))
        if links:
            doc.update({'links': links})
        return
 
    def doScrape(self):
        foidocsdb = scrape.couch['disclosr-foidocuments']
        (url, mime_type, content) = scrape.fetchURL(scrape.docsdb,
            self.getURL(), "foidocuments", self.getAgencyID())
        if content is not None:
            if mime_type == "text/html" or mime_type == "application/xhtml+xml" or mime_type == "application/xml":
            # http://www.crummy.com/software/BeautifulSoup/documentation.html
                print "parsing"
                soup = BeautifulSoup(content)
                table = self.getTable(soup)
                for row in self.getRows(table):
                    columns = row.find_all('td')
                    if len(columns) == self.getColumnCount():
                        (id, date, title,
                         description, notes) = self.getColumns(columns)
                        print self.remove_control_chars(
                            ''.join(id.stripped_strings))
                        if id.string is None:
                            dochash = scrape.mkhash(
                                self.remove_control_chars(
                                    url + (''.join(date.stripped_strings))))
                        else:
                            dochash = scrape.mkhash(
                                self.remove_control_chars(
                                    url + (''.join(id.stripped_strings))))
                        doc = foidocsdb.get(dochash)
 
                        if doc is None:
                            print "saving " + dochash
                            doc = {'_id': dochash,
                                   'agencyID': self.getAgencyID(),
                                   'url': self.getURL(),
                                   'docID': (''.join(id.stripped_strings))}
                            self.getLinks(self.getURL(), row, doc)
                            self.getTitle(title, row, doc)
                            self.getDate(date, row, doc)
                            self.getDescription(description, row, doc)
                            if notes is not None:
                                doc.update({'notes': (
                                    ''.join(notes.stripped_strings))})
                            badtitles = ['-', 'Summary of FOI Request',
                                         'FOI request(in summary form)',
                                         'Summary of FOI request received by the ASC',
                                         'Summary of FOI request received by agency/minister',
                                         'Description of Documents Requested',
                                         'FOI request', 'Description of FOI Request',
                                         'Summary of request', 'Description', 'Summary',
                                         'Summary of FOIrequest received by agency/minister',
                                         'Summary of FOI request received',
                                         'Description of    FOI Request',
                                         'Results 1 to 67 of 67']
                            if doc['title'] not in badtitles and 'description' in doc.keys() and doc['description'] != '':
                                print "saving"
                                foidocsdb.save(doc)
                        else:
                            print "already saved " + dochash
 
                    elif len(row.find_all('th')) == self.getColumnCount():
                        print "header row"
 
                    else:
                        print >> sys.stderr, "ERROR number of columns incorrect"
                        print row