--- a/documents/genericScrapers.py
+++ b/documents/genericScrapers.py
@@ -82,8 +82,24 @@
             descriptiontxt = descriptiontxt + " \n" + string
         doc.update({'description': descriptiontxt})
         return
+    def getTitle(self, content, entry, doc):
+        doc.update({'title': content.string})
+        return
     def getTable(self, soup):
         return soup.table
+    def getDate(self, content, entry, doc):
+        # parsedatetime handles natural-language dates; fall back to a
+        # strict "%d %B %Y" parse when it returns nothing usable
+        cal = pdt.Calendar()
+        dtresult = cal.parseDateText(content.string)
+        if len(dtresult) == 2:
+            (dtdate, dtr) = dtresult
+            edate = str(dtdate[0]) + '-' + str(dtdate[1]) + '-' + str(dtdate[2])
+        else:
+            edate = datetime.strptime(content.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")
+        print edate
+        doc.update({'date': edate})
+        return
 
     def doScrape(self):
         cal = pdt.Calendar()
@@ -111,19 +127,13 @@
                 if doc == None:
                     print "saving"
-                    dtresult = cal.parseDateText(date.string)
-                    if len(dtresult) == 2:
-                        (dtdate,dtr) = dtresult
-                        print dtdate
-                        edate = ""+str(dtdate[0])+'-'+str(dtdate[1])+'-'+str(dtdate[2])
-                    else:
-                        edate = datetime.strptime(date.string.strip(), "%d %B %Y").strftime("%Y-%m-%d")
-                    doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string,
-                     "date": edate,"title": title.string}
+                    doc = {'_id': hash, 'agencyID': self.getAgencyID(), 'url': self.getURL(), 'docID': id.string}
                     if links != []:
                         doc.update({'links': links})
+                    self.getTitle(title, row, doc)
+                    self.getDate(date, row, doc)
                     self.getDescription(description,row, doc)
-                    if notes != None: 
+                    if notes != None:
                         doc.update({ 'notes': notes.string})
                     foidocsdb.save(doc)
                 else:
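
Note: the point of extracting getTitle and getDate is that they become per-row hooks that agency-specific scrapers can override instead of reimplementing doScrape. A minimal sketch of such an override, assuming the generic class in this file is importable as genericScrapers.GenericDisclogScraper and that some agency formats dates as DD/MM/YYYY (the class name, subclass, and format are illustrative assumptions, not part of this patch):

    # Illustrative sketch only: override the new getDate hook for a
    # hypothetical agency whose disclosure log uses DD/MM/YYYY dates.
    from datetime import datetime
    import genericScrapers

    class ExampleAgencyScraper(genericScrapers.GenericDisclogScraper):  # assumed base class name
        def getDate(self, content, entry, doc):
            # a strict parse suffices here, so skip the parsedatetime fallback
            edate = datetime.strptime(content.string.strip(), "%d/%m/%Y").strftime("%Y-%m-%d")
            doc.update({'date': edate})
            return

Because doScrape now calls self.getDate(date, row, doc), the subclass's parser is picked up automatically with no other changes.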