Add start of metadata extract
Former-commit-id: 51210e2d4385a4942d9d7a380a4c16c16811f98b
|
#http://packages.python.org/CouchDB/client.html |
|
import couchdb |
|
from BeautifulSoup import BeautifulSoup |
|
|
|
couch = couchdb.Server('http://127.0.0.1:5984/')

# select database
docsdb = couch['disclosr-documents']

# Walk every row of the 'app/getMetadataExtractRequired' view, load the
# row's first attachment as HTML and print the name of each <meta> tag.
# NOTE(review): presumably the view only emits documents that still need
# metadata extracted, with row.value keyed by attachment name -- confirm
# against the view definition.
for row in docsdb.view('app/getMetadataExtractRequired'):
    print(row.id)

    # The first key of row.value is used as the attachment file name.
    # An empty value means there is nothing to parse for this row.
    attachment_name = next(iter(row.value), None)
    if attachment_name is None:
        continue

    # get_attachment() returns None when the attachment does not exist;
    # skip the row instead of failing on None.read().
    attachment = docsdb.get_attachment(row.id, attachment_name)
    if attachment is None:
        continue
    html = attachment.read()

    metadata = []

    # http://www.crummy.com/software/BeautifulSoup/documentation.html
    soup = BeautifulSoup(html)

    # soup.meta would only return the first <meta> tag (and iterating it
    # walks that tag's children); findAll returns every <meta> tag.
    for metatag in soup.findAll('meta'):
        # Not every <meta> carries a name attribute (e.g. http-equiv or
        # charset declarations), so use get() rather than indexing.
        name = metatag.get('name')
        if name is not None:
            print(name)

    doc = docsdb.get(row.id)

    # TODO: persisting the result is not enabled yet (metadata is still
    # empty at this point):
    # doc['metadata'] = metadata
    # docsdb.save(doc)
|
|
|
<?php

/*
 * Renders an HTML table with one row per agency returned by the "app/all"
 * view of the 'disclosr-agencies' database. For agencies that have a
 * website, the matching document in 'disclosr-documents' is looked up by
 * the md5 of the website URL, and the first space-separated token of its
 * web_server field plus its mime_type are shown.
 *
 * $server, include_header(), include_footer() and setteErrorHandler() are
 * not defined here; presumably they come from include/common.inc.php.
 */

include_once('include/common.inc.php');
include_header();

// NOTE(review): the third column is headed "accessiblity errors" but the
// cell below prints the document's mime_type -- confirm which is intended.
echo "<table>
<tr><th>name</th><th>webserver</th><th>accessiblity errors</th></tr>";

$agenciesdb = $server->get_db('disclosr-agencies');
$docsdb = $server->get_db('disclosr-documents');

try {
    $rows = $agenciesdb->get_view("app", "all", null, true)->rows;

    if ($rows) {
        foreach ($rows as $row) {
            // Default to blank cells so every row has three columns, even
            // when the agency has no website or no matching document.
            $webServer = "";
            $mimeType = "";

            if (isset($row->value->website)) {
                try {
                    // Documents are keyed by the md5 of the website URL.
                    $website = $docsdb->get(md5($row->value->website));

                    if (isset($website->web_server)) {
                        // Keep only the first token of the server string.
                        $serverParts = explode(" ", $website->web_server);
                        $webServer = $serverParts[0];
                    }
                    if (isset($website->mime_type)) {
                        $mimeType = $website->mime_type;
                    }
                } catch (SetteeRestClientException $e) {
                    // Best effort: a failed lookup leaves the cells blank
                    // rather than aborting the whole table.
                    // setteErrorHandler($e);
                }
            }

            // Values come from the database; escape them before they are
            // placed into the HTML output.
            echo "<tr><td>" . htmlspecialchars($row->value->name, ENT_QUOTES, 'UTF-8') . "</td>";
            echo "<td>" . htmlspecialchars($webServer, ENT_QUOTES, 'UTF-8') . "</td>";
            echo "<td>" . htmlspecialchars($mimeType, ENT_QUOTES, 'UTF-8') . "</td>";
            echo "</tr>";
        }
    }
} catch (SetteeRestClientException $e) {
    setteErrorHandler($e);
}

// Close the table opened above, even if the view query failed.
echo "</table>";

include_footer();
?>