Add start of metadata extract
Former-commit-id: 51210e2d4385a4942d9d7a380a4c16c16811f98b
|
#http://packages.python.org/CouchDB/client.html |
|
import couchdb |
|
from BeautifulSoup import BeautifulSoup |
|
|
|
# Connect to the CouchDB server running on this machine.
couch = couchdb.Server('http://127.0.0.1:5984/')

# select database
docsdb = couch['disclosr-documents']

# Walk every row emitted by the "metadata extract required" view, load the
# row's first attachment as HTML and list the names of its <meta> tags.
for row in docsdb.view('app/getMetadataExtractRequired'):
    print(row.id)

    # NOTE(review): row.value is presumably the document's attachment
    # mapping (filename -> stub) -- confirm against the view definition.
    # Rows without any attachment have nothing to parse, so skip them
    # instead of failing on an empty iterator.
    if not row.value:
        continue
    attachment_name = next(iter(row.value))
    html = docsdb.get_attachment(row.id, attachment_name).read()

    # TODO: nothing is collected into this list yet; the loop below only
    # prints the tag names.
    metadata = []

    # http://www.crummy.com/software/BeautifulSoup/documentation.html
    soup = BeautifulSoup(html)

    # soup.meta would be only the FIRST <meta> tag (or None), and iterating
    # it walks that tag's children rather than the page's meta tags.
    # findAll returns every <meta>; attrs={'name': True} keeps only tags
    # that actually carry a name attribute, so tags such as
    # <meta http-equiv=...> or <meta charset=...> cannot raise KeyError.
    for metatag in soup.findAll('meta', attrs={'name': True}):
        print(metatag['name'])

    # Fetched in preparation for storing the extracted metadata.
    doc = docsdb.get(row.id)

    # TODO: persistence is intentionally still disabled; enable once
    # `metadata` is populated.
    # doc['metadata'] = metadata
    # docsdb.save(doc)
|
|