From: Maxious
Date: Wed, 28 Mar 2012 23:13:02 +0000
Subject: Add start of metadata extract
X-Git-Url: https://maxious.lambdacomplex.org/git/?p=disclosr.git&a=commitdiff&h=f7e996f7d620a6eb4f3ee6f45d98dc403e75d5fd
---
Add start of metadata extract

Former-commit-id: 51210e2d4385a4942d9d7a380a4c16c16811f98b
---

--- /dev/null
+++ b/admin/metadata.py
@@ -1,1 +1,30 @@
+#http://packages.python.org/CouchDB/client.html
+# Print the name of every named meta tag found in the first attachment of
+# each document returned by the 'app/getMetadataExtractRequired' view.
+import couchdb
+from BeautifulSoup import BeautifulSoup
+couch = couchdb.Server('http://127.0.0.1:5984/')
+
+# select database
+docsdb = couch['disclosr-documents']
+
+for row in docsdb.view('app/getMetadataExtractRequired'):
+    print row.id
+    # NOTE(review): assumes row.value is a dict keyed by attachment name;
+    # only its first key is fetched - confirm against the view definition
+    html = docsdb.get_attachment(row.id,row.value.iterkeys().next()).read()
+    metadata = []
+    # http://www.crummy.com/software/BeautifulSoup/documentation.html
+    soup = BeautifulSoup(html)
+    # soup.meta yields only the first meta tag, so collect them all with
+    # findAll; attrs={'name': True} keeps only tags that have a name
+    # attribute, since metatag['name'] raises KeyError on the others
+    metatags = soup.findAll('meta', attrs={'name': True})
+    for metatag in metatags:
+        print metatag['name']
+    doc = docsdb.get(row.id)
+    # TODO: fill in metadata and save it back to the document (not enabled yet)
+    #doc['metadata'] = metadata
+    #docsdb.save(doc)
+
--- /dev/null
+++ b/webserver.php
@@ -1,1 +1,35 @@
+
+namewebserveraccessiblity errors";
+$agenciesdb = $server->get_db('disclosr-agencies');
+$docsdb = $server->get_db('disclosr-documents');
+try {
+    $rows = $agenciesdb->get_view("app", "all", null, true)->rows;
+
+
+    if ($rows) {
+        foreach ($rows as $row) {
+
+            echo "" . $row->value->name . "";
+            if (isset($row->value->website)) {
+                try {
+                    $website = $docsdb->get(md5($row->value->website));
+                    $serverParts = explode(" ",$website->web_server);
+                    echo "" . $serverParts[0] . "";
+                    echo "" . $website->mime_type . "";
+                } catch (SetteeRestClientException $e) {
+                    // setteErrorHandler($e);
+                }
+            }
+            echo "";
+        }
+    }
+} catch (SetteeRestClientException $e) {
+    setteErrorHandler($e);
+}
+include_footer();
+?>