From: Alex Sadleir Date: Sat, 29 Jun 2013 14:28:33 +0000 Subject: update overviewer and neo4j exports X-Git-Url: https://maxious.lambdacomplex.org/git/?p=contractdashboard.git&a=commitdiff&h=bea0e42061a399e4b37434c24f88fbeb51299fe4 --- update overviewer and neo4j exports --- --- a/admin/import.php +++ b/admin/import.php @@ -86,8 +86,7 @@ "Agency Postcode" => "contactPostcode", "" => "" ); - - $headers; + while (($data = fgetcsv($handle, 1000, "\t")) !== false) { $num = count($data); @@ -172,7 +171,7 @@ } ksort($files); foreach ($files as $date => $fname) { - echo "$fname " . filesize($path . $fname) . " " . $date . "
"; + echo "$fname " . filesize($path . $fname) . " " . $date . "
"; } } else { $success = 0; --- /dev/null +++ b/admin/importdatagov.php @@ -1,1 +1,216 @@ - + $f) { + $contractNoticeInsertQ.= ($key == 0 ? "" : ", ") . "?"; +} +$contractNoticeInsertQ.= ");"; +$contractNoticeInsertQ = $conn->prepare($contractNoticeInsertQ); + +function processFile($fpath) { + global $conn, $contractNoticeFields, $contractNoticeInsertQ; + $row = 1; + $success = 0; + ini_set('auto_detect_line_endings',TRUE); + $handle = fopen($fpath, "r"); + //"t" mode string translates windows line breaks to unix + $datamapping0712 = array( + + "Agency Name" => "agencyName", + "Parent Contract ID" => "parentCN", + "Contract ID" => "CNID", + "Publish Date" => "publishDate", + "Amendment Date" => "amendDate", + "Start Date" => "contractStart", + "End Date" => "contractEnd", + "Value" => "value", + "Description" => "description", + "Agency Ref ID" => "agencyID", + "UNSPSC Code" => "categoryUNSPSC", + "Title" => "category", + "Procurement Method" => "procurementMethod", + "ATM ID" => "atmID", + "SON ID" => "SONID", + "Confidentiality Contract Flag" => "confidentialityContract", + "Confidentiality Contract Reason" => "confidentialityContractReason", + "Confidentiality Outputs Flag" => "confidentialityOutputs", + "Confidentiality Outputs Reason" => "confidentialityOutputsReason", + "Consultancy Flag" => "consultancy", + "Consultancy Reason" => "consultancyReason", + "Amendment Reason" => "amendmentReason", + "Supplier Name" => "supplierName", + "Supplier Address" => "supplierAddress", + "Supplier Suburb" => "supplierCity", + "Supplier Postcode" => "supplierPostcode", + "Supplier Country" => "supplierCountry", + "Supplier ABN Exempt" => "supplierABNExempt", + "ABN" => "supplierABN", + "Contact Name" => "", + "Contact Phone" => "", + "Branch" => "contactBranch", + "Division" => "contactDivision", + "Office Postcode" => "contactPostcode", + + ); + + + while (($data = fgetcsv($handle, 10000)) !== false) { + //print_r($data); + $num = count($data); + if ($row == 1) { + $headers = $data; + } elseif ($row > 1) { + if ($num > count($datamapping0712)) { + die("Error in data import; data mapping fields out of bounds or changed $num > ".count($datamapping0712)."
" . $fpath . print_r($data)); + } + $contractNoticeInsert = Array(); + $supplierInsert = Array(); + $agencyInsert = Array(); + $contractNoticeInsert[] = $fpath; + $keys = array_keys($datamapping0712); + for ($c = 0; $c < $num; $c++) { + $data[$c] = trim($data[$c], "="); + $data[$c] = trim($data[$c], "\""); + if (in_array(($datamapping0712[$headers[$c]]), $contractNoticeFields)) { + if (($datamapping0712[$headers[$c]]) == "parentCN" || ($datamapping0712[$headers[$c]]) == "CNID") { + $data[$c] = substr($data[$c], 2); // take off the "CN" prefix + if ($data[$c] > 0 && $data[$c] != '0') { + $contractNoticeInsert[] = $data[$c]; + } else { + $contractNoticeInsert[] = null; + } + } elseif (($datamapping0712[$headers[$c]]) == "supplierABN") { + if ($data[$c] > 0 && $data[$c] != '0') { + $contractNoticeInsert[] = $data[$c]; + } else { + $contractNoticeInsert[] = null; + } + } elseif (($datamapping0712[$headers[$c]]) == "amendDate" || ($datamapping0712[$headers[$c]]) == "publishDate" || ($datamapping0712[$headers[$c]]) == "contractStart" || ($datamapping0712[$headers[$c]]) == "contractEnd") { + $contractNoticeInsert[] = date('Y-m-d H:i:s', strtotime($data[$c])); + } else { + if (strstr("\" =", $data[$c] > 0)) { + die("Invalid Description field" . $contractNoticeInsert); + } + $colvalue = preg_replace('/[^[:print:]]/', '', utf8_encode($data[$c])); + + $contractNoticeInsert[] = $colvalue; + } + } + } + flush(); + //print_r($contractNoticeInsert); + $contractNoticeInsertQ->execute($contractNoticeInsert); + $errors = $conn->errorInfo(); + if ($errors[1] == 7 && strpos($errors[2], "duplicate key")) { + // echo "dupe
"; + } elseif ($errors[1] == 0) { + $success++; + } else { + foreach ($contractNoticeFields as $key => $cnf) { + echo var_dump($contractNoticeInsert[$key]) . $cnf . "
"; + } + echo $data[2] . " failed CN insert.
" . print_r($errors, true) . "
row $row

\n"; + } + + flush(); + //echo "
\n"; + } + $row++; + } + fclose($handle); + $contractNoticeInsertQ->closeCursor(); + + return $success; +} + +$path = 'datagovdata/'; +if ($_REQUEST["fname"] == "" && $argv[1] == "") { + echo "Get files from: https://www.tenders.gov.au/?event=public.reports.list
"; + $dhandle = opendir($path); + // define an array to hold the files + $files = array(); + if ($dhandle) { + // loop through all of the files + while (false !== ($fname = readdir($dhandle))) { + if (($fname != '.') && ($fname != '..')) { + $files[date("c", filemtime($path . $fname)) . md5($fname)] = $fname; + } + } + } + ksort($files); + foreach ($files as $date => $fname) { + echo "$fname " . filesize($path . $fname) . " " . $date . "
"; + } +} else { + $success = 0; + $fname = $_REQUEST["fname"]; + if ($argv[1] != "") $fname = $argv[1]; + echo " ============== $fname ==============
"; + flush(); + $success+= processFile($path . $fname, "contractnotice"); + $success+= processFile($path . $fname, "agency"); + $success+= processFile($path . $fname, "supplier"); + echo "
$success records successfully created"; + + flush(); + // run post import data processing + // + if ($success > 0) { +$conn->exec("update datasets set \"lastUpdated\" = NOW() where title = 'Contract Notices'"); + echo "link amend
"; + include ("linkAmendments.php"); + echo "update UNSPSC
"; + include ("updateUNSPSC.php"); + } +// cn + +// agency +//include ("setAgencyStatus.php"); +//include ("setAgencyURLABN.php"); + +} +?> + --- a/admin/neo4jimporter/pom.xml +++ b/admin/neo4jimporter/pom.xml @@ -7,12 +7,13 @@ org.neo4j neo4j - 1.8.RC1 + 2.0.0-M03 postgresql postgresql - 9.0-801.jdbc4 + 9.1-901.jdbc4 + --- a/admin/neo4jimporter/src/main/java/Importer.java +++ b/admin/neo4jimporter/src/main/java/Importer.java @@ -5,25 +5,34 @@ import java.sql.SQLException; import java.sql.SQLWarning; import java.sql.Statement; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.Writer; import java.util.HashMap; +import java.util.Map; +import org.neo4j.graphdb.Direction; +import org.neo4j.graphdb.DynamicLabel; import org.neo4j.graphdb.DynamicRelationshipType; import org.neo4j.graphdb.GraphDatabaseService; +import org.neo4j.graphdb.Label; import org.neo4j.graphdb.Node; -import org.neo4j.graphdb.index.BatchInserterIndex; -import org.neo4j.graphdb.index.BatchInserterIndexProvider; +import org.neo4j.graphdb.RelationshipType; import org.neo4j.helpers.collection.MapUtil; -import org.neo4j.index.impl.lucene.LuceneBatchInserterIndexProvider; -import org.neo4j.kernel.impl.batchinsert.BatchInserter; -import org.neo4j.kernel.impl.batchinsert.BatchInserterImpl; +import org.neo4j.unsafe.batchinsert.BatchInserter; +import org.neo4j.unsafe.batchinsert.BatchInserters; public class Importer { public static void main(String[] argv) { - BatchInserter inserter = new BatchInserterImpl("target/neo4jdb-batchinsert"); - BatchInserterIndexProvider indexProvider = new LuceneBatchInserterIndexProvider(inserter); - BatchInserterIndex labels = indexProvider.nodeIndex("labels", MapUtil.stringMap("type", "exact")); - labels.setCacheCapacity("Label", 100000); +Map config = new HashMap(); +config.put( "neostore.nodestore.db.mapped_memory", "90M" ); +BatchInserter inserter = BatchInserters.inserter("target/batchinserter-example-config", config ); + //BatchInserterIndexProvider indexProvider = new LuceneBatchInserterIndexProvider(inserter); + //BatchInserterIndex names = indexProvider.nodeIndex("names", MapUtil.stringMap("type", "exact")); + //names.setCacheCapacity("name", 100000); @@ -78,52 +87,56 @@ // Execute the query ResultSet rs = stmt.executeQuery("SELECT contractnotice.\"agencyName\", " - + " contractnotice.\"supplierABN\",contractnotice.\"supplierName\",sum(value) as sum " + + " (case when \"supplierABN\" != 0 THEN \"supplierABN\"::text ELSE \"supplierName\" END) as supplierID , max(contractnotice.\"supplierName\") as \"supplierName\",sum(value) as sum " + "FROM public.contractnotice GROUP BY contractnotice.\"agencyName\", " - + " contractnotice.\"supplierABN\",contractnotice.\"supplierName\""); - String previousAgency = ""; - GraphDatabaseService gds = inserter.getGraphDbService(); + + " (case when \"supplierABN\" != 0 THEN \"supplierABN\"::text ELSE \"supplierName\" END)"); HashMap supplierIDs = new HashMap(); HashMap agencyIDs = new HashMap(); + +Label agencyLabel = DynamicLabel.label( "Agency" ); +inserter.createDeferredSchemaIndex( agencyLabel ).on( "name" ); +Label supplierLabel = DynamicLabel.label( "Supplier" ); +inserter.createDeferredSchemaIndex( agencyLabel ).on( "name" ); // Loop through the result set while (rs.next()) { long supplierID, agencyID; String supplierKey; if (agencyIDs.get(rs.getString("agencyName")) == null) { - Node myNode = gds.createNode(); - myNode.setProperty("Label", rs.getString("agencyName")); - myNode.setProperty("type", "agency"); - agencyIDs.put(rs.getString("agencyName"), myNode.getId()); - if (myNode.getId() % 100 == 0) { - System.out.println("Agency " + myNode.getId()); + Map properties = new HashMap(); + properties.put("name", rs.getString("agencyName")); + properties.put("type", rs.getString("agency")); + agencyID = inserter.createNode(properties, agencyLabel); + agencyIDs.put(rs.getString("agencyName"), agencyID); + if (agencyID % 10 == 0) { + System.out.println("Agency " + agencyID); } } agencyID = agencyIDs.get(rs.getString("agencyName")); - if (rs.getString("supplierABN") != "0" && rs.getString("supplierABN") != "") { - supplierKey = rs.getString("supplierABN"); - } else { - supplierKey = rs.getString("supplierName"); - } // inject some data - if (supplierIDs.get(supplierKey) == null) { - Node myNode = gds.createNode(); - myNode.setProperty("Label", rs.getString("supplierName")); - myNode.setProperty("type", "supplier"); - supplierIDs.put(supplierKey, myNode.getId()); - if (myNode.getId() % 1000 == 0) { - System.out.println("Supplier " + myNode.getId()); + if (supplierIDs.get(rs.getString("supplierID")) == null) { + Map properties = new HashMap(); + properties.put("name", rs.getString("supplierName")); + properties.put("type", rs.getString("supplier")); + supplierID = inserter.createNode(properties, supplierLabel); + supplierIDs.put(rs.getString("supplierID"), supplierID); + if (supplierID % 1000 == 0) { + System.out.println("Supplier " + supplierID); } } - supplierID = supplierIDs.get(supplierKey); + supplierID = supplierIDs.get(rs.getString("supplierID")); - long rel = inserter.createRelationship(agencyID, supplierID, - DynamicRelationshipType.withName("KNOWS"), null); - inserter.setRelationshipProperty(rel, "Weight", rs.getDouble("sum")); - +// To set properties on the relationship, use a properties map +// instead of null as the last parameter. +Map properties = new HashMap(); +properties.put( "value", rs.getDouble("sum")); + inserter.createRelationship(agencyID, supplierID, + DynamicRelationshipType.withName("PAYS"), properties); + inserter.createRelationship(supplierID, agencyID, + DynamicRelationshipType.withName("PAID_BY"), properties); } // Close the result set, statement and the connection rs.close(); @@ -142,10 +155,11 @@ } } //make the changes visible for reading, use this sparsely, requires IO! - labels.flush(); +// names.flush(); // Make sure to shut down the index provider - indexProvider.shutdown(); +// indexProvider.shutdown(); inserter.shutdown(); } -} +} + --- a/admin/partialdata/import.php +++ b/admin/partialdata/import.php @@ -12,6 +12,7 @@ $handle = fopen($fpath, "r"); //"t" mode string translates windows line breaks to unix $datamapping0507 = array( + "Agency" => "agencyName", "CN ID" => "CNID", "Publish Date" => "publishDate", @@ -137,7 +138,7 @@ // loop through all of the files while (false !== ($fname = readdir($dhandle))) { if (($fname != '.') && ($fname != '..') && (!isset($_REQUEST["filter"]) || strpos($fname,$_REQUEST["filter"]) != false)) { - echo "$fname " . filesize($path . $fname) . " " . date("c", filemtime($path . $fname)) . "
"; + echo "$fname " . filesize($path . $fname) . " " . date("c", filemtime($path . $fname)) . "
"; processFile($path . $fname, "contractnotice"); } } --- a/admin/partialdata/importamendments.php +++ b/admin/partialdata/importamendments.php @@ -310,7 +310,7 @@ // loop through all of the files while (false !== ($fname = readdir($dhandle))) { if (($fname != '.') && ($fname != '..') && (strpos($fname,".xls")>0)) { - echo "$fname " . filesize($path . $fname) . " " . date("c", filemtime($path . $fname)) . "
"; + echo "$fname " . filesize($path . $fname) . " " . date("c", filemtime($path . $fname)) . "
"; processFile($path . $fname, "contractnotice"); } } --- a/exportOverview.csv.php +++ b/exportOverview.csv.php @@ -11,10 +11,11 @@ $unspsc[$row['UNSPSC']] = $row['Title']; } +//(\'https://www.tenders.gov.au/?event=public.advancedsearch.keyword&keyword=CN\'::text || "CNID"::text) as url $query = $conn->prepare(' -SELECT "CNID" as uid, description as text, -(\'https://www.tenders.gov.au/?event=public.advancedsearch.keyword&keyword=CN\'::text || "CNID"::text) as url from "contractnotice" -where "childCN" is null' +SELECT "CNID" as id, description as text +from "contractnotice" +where "childCN" is null limit 10000' , array(PDO::ATTR_CURSOR => PDO::FETCH_ORI_NEXT)); $query->execute(); $errors = $conn->errorInfo(); --- a/lib/common.inc.php +++ b/lib/common.inc.php @@ -314,6 +314,3 @@ include ("graphs.inc.php"); - - -