beginnings of NAA data import
[disclosr.git] / admin / neo4jimporter / src / main / java / StAXSample.java
blob:a/admin/neo4jimporter/src/main/java/StAXSample.java -> blob:b/admin/neo4jimporter/src/main/java/StAXSample.java
--- a/admin/neo4jimporter/src/main/java/StAXSample.java
+++ b/admin/neo4jimporter/src/main/java/StAXSample.java
@@ -1,1 +1,375 @@
-
+import org.neo4j.graphdb.DynamicLabel;
+import org.neo4j.graphdb.DynamicRelationshipType;
+import org.neo4j.graphdb.Label;
+import org.neo4j.unsafe.batchinsert.BatchInserter;
+import org.neo4j.unsafe.batchinsert.BatchInserters;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.events.XMLEvent;
+
+/**
+ * Streams an agencies XML file with StAX and loads what it finds into a Neo4j
+ * store through the batch inserter.
+ *
+ * <p>Agencies, locations, functions and statuses become nodes; the
+ * AGENCY_LINK / AGENCY_LOCATION / AGENCY_FUNCTION / AGENCY_STATUS elements
+ * become relationships between them. Node ids are cached in memory so that each
+ * agency number, location name, function name and status name maps to a single
+ * node for the duration of one run.
+ */
+public class StAXSample {
+
+
+    /** Node id of every agency node created so far, keyed by agency number. */
+    HashMap<String, Long> agencyIDs = new HashMap<String, Long>();
+    /** Agency numbers whose node has been written with more than two properties. */
+    HashMap<String, Boolean> agencyFullVersion = new HashMap<String, Boolean>();
+    Label agencyLabel = DynamicLabel.label("Agency");
+    /** Node id of every location node created so far, keyed by location name. */
+    HashMap<String, Long> locationIDs = new HashMap<String, Long>();
+    Label locationLabel = DynamicLabel.label("Location");
+    /** Node id of every function node created so far, keyed by function name. */
+    HashMap<String, Long> functionIDs = new HashMap<String, Long>();
+    Label functionLabel = DynamicLabel.label("Function");
+    /** Node id of every status node created so far, keyed by status name. */
+    HashMap<String, Long> statusIDs = new HashMap<String, Long>();
+    // Previously "Location", which labelled every status node as a Location.
+    Label statusLabel = DynamicLabel.label("Status");
+    BatchInserter inserter;
+
+    private String filename;
+
+    public StAXSample() {
+    }
+
+    /**
+     * Command line entry point.
+     *
+     * @param args exactly one element: the path of the XML file to import
+     */
+    public static void main(String[] args) {
+        if (args.length != 1) {
+            System.out.println("Usage: StAXSample file.xml");
+            System.exit(-1);
+        }
+
+        StAXSample ss = new StAXSample();
+        ss.setFilename(args[0]);
+        ss.run();
+    }
+
+    /**
+     * Opens the batch inserter, registers the deferred schema indexes, imports
+     * the configured file and shuts the inserter down again, even when the
+     * import fails part-way.
+     */
+    public void run() {
+
+        Map<String, String> config = new HashMap<String, String>();
+        config.put("neostore.nodestore.db.mapped_memory", "90M");
+        inserter = BatchInserters.inserter("target/batchinserter-example-config", config);
+        try {
+            // on(...) only configures the IndexCreator; create() is what registers the index.
+            inserter.createDeferredSchemaIndex(agencyLabel).on("agency_no").create();
+            inserter.createDeferredSchemaIndex(locationLabel).on("location_name").create();
+            // Function nodes store their name under "function_name" (see getFunction).
+            inserter.createDeferredSchemaIndex(functionLabel).on("function_name").create();
+            inserter.createDeferredSchemaIndex(statusLabel).on("status_name").create();
+
+            importFile();
+        } finally {
+            inserter.shutdown();
+        }
+    }
+
+    /**
+     * Parses the configured file event by event and dispatches every start
+     * element to its handler. Parse and I/O failures are reported on the
+     * console and end the import; they are not rethrown.
+     */
+    private void importFile() {
+        FileInputStream in = null;
+        XMLEventReader r = null;
+        try {
+            XMLInputFactory xmlif = XMLInputFactory.newInstance();
+            xmlif.setProperty(
+                    XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES,
+                    Boolean.TRUE);
+            xmlif.setProperty(
+                    XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES,
+                    Boolean.FALSE);
+            // IS_COALESCING delivers the whole text of an element as one event.
+            xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
+
+            in = new FileInputStream(new File(filename));
+            r = xmlif.createXMLEventReader(filename, in);
+
+            // Lives across events so the properties of one agency record can accumulate.
+            Map<String, Object> previousAgency = new HashMap<String, Object>();
+
+            //iterate as long as there are more events on the input stream
+            while (r.hasNext()) {
+                XMLEvent e = r.nextEvent();
+                if (!e.isStartElement()) {
+                    continue;
+                }
+                if (hasStartTagName(e, "AGENCIES")) {
+                    System.out.println("Agencies file loaded... ");
+                } else if (hasStartTagName(e, "TITLE")) {
+                    // Read once: a second read would move past the text event.
+                    String title = getCharacters(r);
+                    System.out.println("TITLE is: " + title);
+                    putIfPresent(previousAgency, "title", title);
+                } else if (hasStartTagName(e, "END_DATE_QUAL")) {
+                    String endDateQual = getCharacters(r);
+                    System.out.println("END_DATE_QUAL is: " + endDateQual);
+                    putIfPresent(previousAgency, "end_date_qual", endDateQual);
+                    // save agency
+                    // NOTE(review): no branch here captures the agency number of a
+                    // top-level agency record yet, so such records are skipped until
+                    // the matching element is handled - confirm its tag name.
+                    if (previousAgency.get("agency_no") != null) {
+                        getAgency(previousAgency);
+                    } else {
+                        System.err.println("Skipping agency record without agency_no: " + previousAgency);
+                    }
+                    previousAgency = new HashMap<String, Object>();
+                } else if (hasStartTagName(e, "AGENCY_LINK")) {
+                    processAgencyLink(r);
+                } else if (hasStartTagName(e, "AGENCY_LOCATION")) {
+                    processAgencyLocation(r);
+                } else if (hasStartTagName(e, "AGENCY_FUNCTION")) {
+                    processAgencyFunction(r);
+                } else if (hasStartTagName(e, "AGENCY_STATUS")) {
+                    processAgencyStatus(r);
+                } else {
+                    System.out.println("Unhandled tag: " + getStartTagName(e) + " content:" + getCharacters(r));
+                }
+            }
+        } catch (XMLStreamException ex) {
+            System.out.println(ex.getMessage());
+
+            if (ex.getNestedException() != null) {
+                ex.getNestedException().printStackTrace();
+            }
+        } catch (FileNotFoundException ex) {
+            System.err.println("Error.  Cannot find file \"" + filename + "\".");
+            ex.printStackTrace();
+        } catch (Exception ex) {
+            ex.printStackTrace();
+        } finally {
+            // Closing the event reader does not close the stream underneath it.
+            if (r != null) {
+                try {
+                    r.close();
+                } catch (XMLStreamException ex) {
+                    System.err.println("Could not close XML reader: " + ex.getMessage());
+                }
+            }
+            if (in != null) {
+                try {
+                    in.close();
+                } catch (Exception ex) {
+                    System.err.println("Could not close \"" + filename + "\": " + ex.getMessage());
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns the node id of the agency described by {@code properties},
+     * creating the node on first sight.
+     *
+     * <p>A map with more than two entries counts as a full record: the first
+     * full record seen for an agency that so far only exists as a stub replaces
+     * the node's properties.
+     *
+     * @param properties agency properties; must contain a non-null "agency_no"
+     * @return the id of the new or previously created agency node
+     * @throws IllegalArgumentException if "agency_no" is missing
+     */
+    private long getAgency(Map<String, Object> properties) {
+        Object rawAgencyNo = properties.get("agency_no");
+        if (rawAgencyNo == null) {
+            throw new IllegalArgumentException("Agency properties lack agency_no: " + properties);
+        }
+        String agencyNo = rawAgencyNo.toString();
+        boolean fullRecord = properties.size() > 2;
+
+        Long knownID = agencyIDs.get(agencyNo);
+        if (knownID == null) {
+            long agencyID = inserter.createNode(properties, agencyLabel);
+            if (fullRecord) {
+                agencyFullVersion.put(agencyNo, true);
+            }
+            agencyIDs.put(agencyNo, agencyID);
+            return agencyID;
+        }
+
+        if (fullRecord && agencyFullVersion.get(agencyNo) == null) {
+            inserter.setNodeProperties(knownID, properties);
+            agencyFullVersion.put(agencyNo, true);
+        }
+        return knownID;
+    }
+
+    /** Looks up, or creates as a stub carrying only its number, the agency with the given number. */
+    private long getAgencyByNumber(String agencyNo) {
+        Map<String, Object> stub = new HashMap<String, Object>();
+        stub.put("agency_no", agencyNo);
+        return getAgency(stub);
+    }
+
+    /** Returns the node id for the named location, creating the node if needed. */
+    private long getLocation(String locationName) {
+        return getOrCreateNode(locationIDs, locationLabel, "location_name", locationName);
+    }
+
+    /** Returns the node id for the named function, creating the node if needed. */
+    private long getFunction(String functionName) {
+        return getOrCreateNode(functionIDs, functionLabel, "function_name", functionName);
+    }
+
+    /** Returns the node id for the named status, creating the node if needed. */
+    private long getStatus(String statusName) {
+        return getOrCreateNode(statusIDs, statusLabel, "status_name", statusName);
+    }
+
+    /**
+     * Shared lookup-or-create used by the location, function and status caches.
+     *
+     * @param ids   cache from name to node id; updated when a node is created
+     * @param label label given to a newly created node
+     * @param key   property under which the name is stored on the node
+     * @param value the name to look up
+     * @return the id of the cached or newly created node
+     */
+    private long getOrCreateNode(Map<String, Long> ids, Label label, String key, String value) {
+        Long knownID = ids.get(value);
+        if (knownID != null) {
+            return knownID;
+        }
+        Map<String, Object> properties = new HashMap<String, Object>();
+        properties.put(key, value);
+        long createdID = inserter.createNode(properties, label);
+        ids.put(value, createdID);
+        return createdID;
+    }
+
+    /** Stores {@code value} under {@code key} unless it is null; property maps must not carry nulls. */
+    private static void putIfPresent(Map<String, Object> target, String key, Object value) {
+        if (value != null) {
+            target.put(key, value);
+        }
+    }
+
+    /**
+     * Consumes one AGENCY_LINK element and records it as an IS_LINKED_TO
+     * relationship between the two agencies it names. A link that lacks either
+     * agency number is reported and skipped.
+     *
+     * @param rdr reader positioned just after the AGENCY_LINK start tag
+     */
+    private void processAgencyLink(XMLEventReader rdr) throws Exception {
+        String agencyFromNo = null;
+        String agencyToNo = null;
+        String linkType = null;
+        String startDate = null;
+        String startDateQual = null;
+        String endDate = null;
+        String endDateQual = null;
+
+        while (rdr.hasNext()) {
+            XMLEvent e = rdr.nextEvent();
+            if (e.isStartElement()) {
+                if (hasStartTagName(e, "LINK_AGENCY_NO")) {
+                    agencyFromNo = getCharacters(rdr);
+                } else if (hasStartTagName(e, "LINK_TO_AGENCY_NO")) {
+                    agencyToNo = getCharacters(rdr);
+                } else if (hasStartTagName(e, "LINK_TYPE")) {
+                    linkType = getCharacters(rdr);
+                } else if (hasStartTagName(e, "START_DATE")) {
+                    startDate = getCharacters(rdr);
+                } else if (hasStartTagName(e, "START_DATE_QUAL")) {
+                    startDateQual = getCharacters(rdr);
+                } else if (hasStartTagName(e, "END_DATE")) {
+                    endDate = getCharacters(rdr);
+                } else if (hasStartTagName(e, "END_DATE_QUAL")) {
+                    endDateQual = getCharacters(rdr);
+                }
+            } else if (e.isEndElement() && hasEndTagName(e, "AGENCY_LINK")) {
+                if (agencyFromNo == null || agencyToNo == null) {
+                    System.err.println("Skipping AGENCY_LINK without both agency numbers: from = "
+                            + agencyFromNo + "; to = " + agencyToNo);
+                    break;
+                }
+                long agencyFromID = getAgencyByNumber(agencyFromNo);
+                long agencyToID = getAgencyByNumber(agencyToNo);
+                Map<String, Object> relProperties = new HashMap<String, Object>();
+                putIfPresent(relProperties, "link_type", linkType);
+                putIfPresent(relProperties, "start_date", startDate);
+                putIfPresent(relProperties, "start_date_qual", startDateQual);
+                putIfPresent(relProperties, "end_date", endDate);
+                putIfPresent(relProperties, "end_date_qual", endDateQual);
+                inserter.createRelationship(agencyFromID, agencyToID,
+                        DynamicRelationshipType.withName("IS_LINKED_TO"), relProperties);
+
+                break;
+            }
+        }
+    }
+
+    /**
+     * Consumes one AGENCY_LOCATION element and records it as a HAS_LOCATION
+     * relationship from the agency to the location. An element that lacks the
+     * agency number or the location text is reported and skipped.
+     *
+     * @param rdr reader positioned just after the AGENCY_LOCATION start tag
+     */
+    private void processAgencyLocation(XMLEventReader rdr) throws Exception {
+        String of = null;
+        String name = null;
+        String date = null;
+
+        while (rdr.hasNext()) {
+            XMLEvent e = rdr.nextEvent();
+            if (e.isStartElement()) {
+                if (hasStartTagName(e, "LOCATION_AGENCY_NO")) {
+                    of = getCharacters(rdr);
+                } else if (hasStartTagName(e, "LOCATION_TEXT")) {
+                    name = getCharacters(rdr);
+                } else if (hasStartTagName(e, "LOCATION_DATE")) {
+                    date = getCharacters(rdr);
+                }
+            } else if (e.isEndElement() && hasEndTagName(e, "AGENCY_LOCATION")) {
+                System.out.println("Finished processing location:  Name = " + name + "; of = " + of + "; date = " + date);
+                if (of == null || name == null) {
+                    System.err.println("Skipping AGENCY_LOCATION without agency number or location text");
+                    break;
+                }
+                long locationID = getLocation(name);
+                long agencyID = getAgencyByNumber(of);
+                Map<String, Object> relProperties = new HashMap<String, Object>();
+                putIfPresent(relProperties, "date", date);
+                inserter.createRelationship(agencyID, locationID,
+                        DynamicRelationshipType.withName("HAS_LOCATION"), relProperties);
+
+                break;
+            }
+        }
+    }
+
+    /**
+     * Consumes one AGENCY_STATUS element and records it as a HAS_STATUS
+     * relationship from the agency to the status. An element that lacks the
+     * agency number or the status is reported and skipped.
+     *
+     * @param rdr reader positioned just after the AGENCY_STATUS start tag
+     */
+    private void processAgencyStatus(XMLEventReader rdr) throws Exception {
+        String of = null;
+        String status = null;
+        String date = null;
+
+        while (rdr.hasNext()) {
+            XMLEvent e = rdr.nextEvent();
+            if (e.isStartElement()) {
+                if (hasStartTagName(e, "STATUS_AGENCY_NO")) {
+                    of = getCharacters(rdr);
+                } else if (hasStartTagName(e, "STATUS")) {
+                    status = getCharacters(rdr);
+                } else if (hasStartTagName(e, "STATUS_DATE")) {
+                    date = getCharacters(rdr);
+                }
+            } else if (e.isEndElement() && hasEndTagName(e, "AGENCY_STATUS")) {
+                System.out.println("Finished processing status:  Status = " + status + "; of = " + of + "; date = " + date);
+                if (of == null || status == null) {
+                    System.err.println("Skipping AGENCY_STATUS without agency number or status");
+                    break;
+                }
+                long statusID = getStatus(status);
+                long agencyID = getAgencyByNumber(of);
+                Map<String, Object> relProperties = new HashMap<String, Object>();
+                putIfPresent(relProperties, "date", date);
+                inserter.createRelationship(agencyID, statusID,
+                        DynamicRelationshipType.withName("HAS_STATUS"), relProperties);
+
+                break;
+            }
+        }
+    }
+
+    /**
+     * Consumes one AGENCY_FUNCTION element and records it as a HAS_FUNCTION
+     * relationship from the agency to the function. An element that lacks the
+     * agency number or the thesaurus term is reported and skipped.
+     *
+     * @param rdr reader positioned just after the AGENCY_FUNCTION start tag
+     */
+    private void processAgencyFunction(XMLEventReader rdr) throws Exception {
+        String agency = null;
+        String thesaurusTerm = null;
+        String startDate = null;
+        String startDateQual = null;
+        String endDate = null;
+        String endDateQual = null;
+
+        while (rdr.hasNext()) {
+            XMLEvent e = rdr.nextEvent();
+            if (e.isStartElement()) {
+                if (hasStartTagName(e, "FUNCTION_AGENCY_NO")) {
+                    agency = getCharacters(rdr);
+                } else if (hasStartTagName(e, "THESAURUS_TERM")) {
+                    thesaurusTerm = getCharacters(rdr);
+                } else if (hasStartTagName(e, "START_DATE")) {
+                    startDate = getCharacters(rdr);
+                } else if (hasStartTagName(e, "START_DATE_QUAL")) {
+                    startDateQual = getCharacters(rdr);
+                } else if (hasStartTagName(e, "END_DATE")) {
+                    endDate = getCharacters(rdr);
+                } else if (hasStartTagName(e, "END_DATE_QUAL")) {
+                    endDateQual = getCharacters(rdr);
+                }
+            } else if (e.isEndElement() && hasEndTagName(e, "AGENCY_FUNCTION")) {
+                if (agency == null || thesaurusTerm == null) {
+                    System.err.println("Skipping AGENCY_FUNCTION without agency number or thesaurus term");
+                    break;
+                }
+                long functionID = getFunction(thesaurusTerm);
+                long agencyID = getAgencyByNumber(agency);
+                Map<String, Object> relProperties = new HashMap<String, Object>();
+                putIfPresent(relProperties, "start_date", startDate);
+                putIfPresent(relProperties, "start_date_qual", startDateQual);
+                putIfPresent(relProperties, "end_date", endDate);
+                putIfPresent(relProperties, "end_date_qual", endDateQual);
+                inserter.createRelationship(agencyID, functionID,
+                        DynamicRelationshipType.withName("HAS_FUNCTION"), relProperties);
+
+                break;
+            }
+        }
+    }
+
+    /**
+     * Returns the text that immediately follows the current position, or null
+     * when the next event is not character data (an empty element, a nested
+     * element, or the end of the stream). Only a character event is consumed,
+     * so a start or end tag is left for the caller's loop to see.
+     */
+    private String getCharacters(XMLEventReader rdr) throws XMLStreamException {
+        XMLEvent upcoming = rdr.peek();
+        if (upcoming != null && upcoming.isCharacters()) {
+            return rdr.nextEvent().asCharacters().getData();
+        }
+        return null;
+    }
+
+    /** Tells whether the start element {@code e} has the local name {@code name}. */
+    private boolean hasStartTagName(XMLEvent e, String name) {
+        return e.asStartElement().getName().getLocalPart().equals(name);
+    }
+
+    /** Returns the local name of the start element {@code e}. */
+    private String getStartTagName(XMLEvent e) {
+        return e.asStartElement().getName().getLocalPart();
+    }
+
+    /** Tells whether the end element {@code e} has the local name {@code name}. */
+    private boolean hasEndTagName(XMLEvent e, String name) {
+        return e.asEndElement().getName().getLocalPart().equals(name);
+    }
+
+    /** Sets the path of the XML file that {@link #run()} imports. */
+    public void setFilename(String filename) {
+        this.filename = filename;
+    }
+
+
+}
+