beginnings of NAA data import
[disclosr.git] / admin / neo4jimporter / src / main / java / StAXSample.java
blob:a/admin/neo4jimporter/src/main/java/StAXSample.java -> blob:b/admin/neo4jimporter/src/main/java/StAXSample.java
  import org.neo4j.graphdb.DynamicLabel;
  import org.neo4j.graphdb.DynamicRelationshipType;
  import org.neo4j.graphdb.Label;
  import org.neo4j.unsafe.batchinsert.BatchInserter;
  import org.neo4j.unsafe.batchinsert.BatchInserters;
   
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.FileNotFoundException;
  import java.util.HashMap;
  import java.util.Map;
   
  import javax.xml.stream.XMLInputFactory;
  import javax.xml.stream.XMLStreamException;
  import javax.xml.stream.XMLEventReader;
  import javax.xml.stream.events.XMLEvent;
   
  public class StAXSample {
   
   
  HashMap<String, Long> agencyIDs = new HashMap<String, Long>();
  HashMap<String, Boolean> agencyFullVersion = new HashMap<String, Boolean>();
  Label agencyLabel = DynamicLabel.label("Agency");
  HashMap<String, Long> locationIDs = new HashMap<String, Long>();
  Label locationLabel = DynamicLabel.label("Location");
  HashMap<String, Long> functionIDs = new HashMap<String, Long>();
  Label functionLabel = DynamicLabel.label("Function");
  HashMap<String, Long> statusIDs = new HashMap<String, Long>();
  Label statusLabel = DynamicLabel.label("Location");
  BatchInserter inserter;
   
  private String filename;
   
  /**
   * Creates an importer with no input file selected yet; {@code main} supplies
   * the file afterwards through {@code setFilename}.
   */
  public StAXSample() {
  }
   
  public static void main(String[] args) {
  if (args.length != 1) {
  System.out.println("Usage: StAXSample file.xml");
  System.exit(-1);
  }
   
  StAXSample ss = new StAXSample();
  ss.setFilename(args[0]);
  ss.run();
  }
   
  public void run() {
   
  Map<String, String> config = new HashMap<String, String>();
  config.put("neostore.nodestore.db.mapped_memory", "90M");
  inserter = BatchInserters.inserter("target/batchinserter-example-config", config);
  inserter.createDeferredSchemaIndex(agencyLabel).on("agency_no");
  inserter.createDeferredSchemaIndex(locationLabel).on("location_name");
  inserter.createDeferredSchemaIndex(functionLabel).on("thesaurus_term");
  inserter.createDeferredSchemaIndex(statusLabel).on("status_name");
   
  try {
  XMLInputFactory xmlif = XMLInputFactory.newInstance();
  xmlif.setProperty(
  XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES,
  Boolean.TRUE);
  xmlif.setProperty(
  XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES,
  Boolean.FALSE);
  //set the IS_COALESCING property to true
  //to get whole text data as one event.
  xmlif.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
   
  try {
  XMLEventReader r = null;
  r = xmlif.createXMLEventReader(
  filename,
  //new FileInputStream(new File(xmlFileURL.toURI())));
  new FileInputStream(new File(filename)));
   
   
  //iterate as long as there are more events on the input stream
  while (r.hasNext()) {
  XMLEvent e = r.nextEvent();
  Map<String, Object> previousAgency = new HashMap<String, Object>();
  if (e.isStartElement()) {
  if (hasStartTagName(e, "AGENCIES")) {
  System.out.println("Agencies file loaded... ");
  } else if (hasStartTagName(e, "TITLE")) {
  System.out.println("TITLE is: " + getCharacters(r));
  previousAgency.put("title", getCharacters(r));
  } else if (hasStartTagName(e, "END_DATE_QUAL")) {
  System.out.println("END_DATE_QUAL is: " + getCharacters(r));
  previousAgency.put("end_date_qual", getCharacters(r));
  // save agency
  getAgency(previousAgency);
  previousAgency = new HashMap<String, Object>();
  } else if (hasStartTagName(e, "AGENCY_LINK")) {
  processAgencyLink(r);
  } else if (hasStartTagName(e, "AGENCY_LOCATION")) {
  processAgencyLocation(r);
  } else if (hasStartTagName(e, "AGENCY_FUNCTION")) {
  processAgencyFunction(r);
  } else if (hasStartTagName(e, "AGENCY_STATUS")) {
  processAgencyStatus(r);
  } else {
  System.out.println("Unhandled tag: " + getStartTagName(e) + " content:" + getCharacters(r));
  }
  }
  }
  } catch (XMLStreamException ex) {
  System.out.println(ex.getMessage());
   
  if (ex.getNestedException() != null) {
  ex.getNestedException().printStackTrace();
  }
  }
   
  } catch (FileNotFoundException ex) {
  System.err.println("Error. Cannot find \"" + filename + "\" in classpath.");
  ex.printStackTrace();
  } catch (Exception ex) {
  ex.printStackTrace();
  }
   
  inserter.shutdown();
  }
   
  private long getAgency(Map<String, Object> properties) {
  if (agencyIDs.get(properties.get("agency_no").toString()) == null) {
  long agencyID = inserter.createNode(properties, agencyLabel);
  if (properties.values().size() > 2) {
  agencyFullVersion.put(properties.get("agency_no").toString(), true);
  }
  agencyIDs.put(properties.get("agency_no").toString(), agencyID);
  return agencyID;
  } else {
  long agencyID = agencyIDs.get(properties.get("agency_no").toString());
  if (properties.values().size() > 2 && agencyFullVersion.get(properties.get("agency_no")) == null) {
  inserter.setNodeProperties(agencyID, properties);
  agencyFullVersion.put(properties.get("agency_no").toString(), true);
  }
  return agencyID;
  }
  }
   
  private long getLocation(String locationName) {
  if (locationIDs.get(locationName) == null) {
  HashMap properties = new HashMap< String,Object > ();
  properties.put("location_name", locationName);
  long locationID = inserter.createNode(properties, locationLabel);
  locationIDs.put(locationName, locationID);
  return locationID;
  } else {
  return locationIDs.get(locationName);
  }
  }
  private long getFunction(String functionName) {
  if (functionIDs.get(functionName) == null) {
  HashMap properties = new HashMap< String,Object > ();
  properties.put("function_name", functionName);
  long functionID = inserter.createNode(properties, functionLabel);
  functionIDs.put(functionName, functionID);
  return functionID;
  } else {
  return functionIDs.get(functionName);
  }
  }
  private long getStatus(String statusName) {
  if (statusIDs.get(statusName) == null) {
  HashMap properties = new HashMap< String,Object > ();
  properties.put("status_name", statusName);
  long statusID = inserter.createNode(properties, statusLabel);
  statusIDs.put(statusName, statusID);
  return statusID;
  } else {
  return statusIDs.get(statusName);
  }
  }
   
  private void processAgencyLink(XMLEventReader rdr) throws Exception {
  String agency_from_no = null;
  String agency_to_no = null;
  String link_type = null;
  String start_date = null;
  String start_date_qual = null;
  String end_date = null;
  String end_date_qual = null;
   
  while (rdr.hasNext()) {
  XMLEvent e = rdr.nextEvent();
  if (e.isStartElement()) {
  if (hasStartTagName(e, "LINK_AGENCY_NO")) {
  agency_from_no = getCharacters(rdr);
  } else if (hasStartTagName(e, "LINK_TO_AGENCY_NO")) {
  agency_to_no = getCharacters(rdr);
  } else if (hasStartTagName(e, "LINK_TYPE")) {
  link_type = getCharacters(rdr);
  } else if (hasStartTagName(e, "START_DATE")) {
  start_date = getCharacters(rdr);
  }else if (hasStartTagName(e, "START_DATE_QUAL")) {
  start_date_qual = getCharacters(rdr);
  }else if (hasStartTagName(e, "END_DATE")) {
  end_date = getCharacters(rdr);
  }else if (hasStartTagName(e, "END_DATE_QUAL")) {
  end_date_qual = getCharacters(rdr);
  }
  }
  if (e.isEndElement()) {
  if (hasEndTagName(e, "AGENCY_LINK")) {
   
  //System.out.println("Finished processing link: Name = " + name + "; of = " + of + "; date = " + date);
  long agencyFromID, agencyToID;
  Map<String, Object> agencyFromProperties = new HashMap<String, Object>();
  agencyFromProperties.put("agency_no",agency_from_no);
  agencyFromID = getAgency(agencyFromProperties);
  Map<String, Object> agencyToProperties = new HashMap<String, Object>();
  agencyToProperties.put("agency_no",agency_to_no);
  agencyToID = getAgency(agencyToProperties);
  Map<String, Object> relProperties = new HashMap<String, Object>();
  relProperties.put("link_type", link_type);
  relProperties.put("start_date", start_date);
  relProperties.put("start_date_qual", start_date_qual);
  relProperties.put("end_date", end_date);
  relProperties.put("end_date_qual", end_date_qual);
  inserter.createRelationship(agencyFromID, agencyToID,
  DynamicRelationshipType.withName("IS_LINKED_TO"), relProperties);
   
  break;
  }
  }
  }
  }
   
  private void processAgencyLocation(XMLEventReader rdr) throws Exception {
  String of = null;
  String name = null;
  String date = null;
   
  while (rdr.hasNext()) {
  XMLEvent e = rdr.nextEvent();
  if (e.isStartElement()) {
  if (hasStartTagName(e, "LOCATION_AGENCY_NO")) {
  of = getCharacters(rdr);
  } else if (hasStartTagName(e, "LOCATION_TEXT")) {
  name = getCharacters(rdr);
  } else if (hasStartTagName(e, "LOCATION_DATE")) {
  date = getCharacters(rdr);
  }
  }
  if (e.isEndElement()) {
  if (hasEndTagName(e, "AGENCY_LOCATION")) {
  System.out.println("Finished processing location: Name = " + name + "; of = " + of + "; date = " + date);
  long locationID, agencyID;
  locationID = getLocation(name);
  Map<String, Object> agencyProperties = new HashMap<String, Object>();
  agencyProperties.put("agency_no",of);
  agencyID = getAgency(agencyProperties);
  Map<String, Object> relProperties = new HashMap<String, Object>();
  relProperties.put("date", date);
  inserter.createRelationship(agencyID, locationID,
  DynamicRelationshipType.withName("HAS_LOCATION"), relProperties);
   
  break;
  }
  }
  }
  }
   
  private void processAgencyStatus(XMLEventReader rdr) throws Exception {
  String of = null;
  String status = null;
  String date = null;
   
  while (rdr.hasNext()) {
  XMLEvent e = rdr.nextEvent();
  if (e.isStartElement()) {
  if (hasStartTagName(e, "STATUS_AGENCY_NO")) {
  of = getCharacters(rdr);
  } else if (hasStartTagName(e, "STATUS")) {
  status = getCharacters(rdr);
  } else if (hasStartTagName(e, "STATUS_DATE")) {
  date = getCharacters(rdr);
  }
  }
  if (e.isEndElement()) {
  if (hasEndTagName(e, "AGENCY_STATUS")) {
  System.out.println("Finished processing status: Status = " + status + "; of = " + of + "; date = " + date);
  long statusID, agencyID;
  statusID = getStatus(status);
  Map<String, Object> agencyProperties = new HashMap<String, Object>();
  agencyProperties.put("agency_no",of);
  agencyID = getAgency(agencyProperties);
  Map<String, Object> relProperties = new HashMap<String, Object>();
  relProperties.put("date", date);
  inserter.createRelationship(agencyID, statusID,
  DynamicRelationshipType.withName("HAS_STATUS"), relProperties);
   
  break;
  }
  }
  }
  }
   
  private void processAgencyFunction(XMLEventReader rdr) throws Exception {
  String agency = null;
  String thesaurus_term = null;
  String start_date = null;
  String start_date_qual = null;
  String end_date = null;
  String end_date_qual = null;
   
  while (rdr.hasNext()) {
  XMLEvent e = rdr.nextEvent();
  if (e.isStartElement()) {
  if (hasStartTagName(e, "FUNCTION_AGENCY_NO")) {
  agency = getCharacters(rdr);
  } else if (hasStartTagName(e, "THESAURUS_TERM")) {
  thesaurus_term = getCharacters(rdr);
  } else if (hasStartTagName(e, "START_DATE")) {
  start_date = getCharacters(rdr);
  }else if (hasStartTagName(e, "START_DATE_QUAL")) {
  start_date_qual = getCharacters(rdr);
  }else if (hasStartTagName(e, "END_DATE")) {
  end_date = getCharacters(rdr);
  }else if (hasStartTagName(e, "END_DATE_QUAL")) {
  end_date_qual = getCharacters(rdr);
  }
  }
  if (e.isEndElement()) {
  if (hasEndTagName(e, "AGENCY_FUNCTION")) {
  //System.out.println("Finished processing function: Name = " + name + "; of = " + of + "; date = " + date);
  long functionID, agencyID;
  functionID = getFunction(thesaurus_term);
  Map<String, Object> agencyProperties = new HashMap<String, Object>();
  agencyProperties.put("agency_no",agency);
  agencyID = getAgency(agencyProperties);
  Map<String, Object> relProperties = new HashMap<String, Object>();
  relProperties.put("start_date", start_date);
  relProperties.put("start_date_qual", start_date_qual);
  relProperties.put("end_date", end_date);
  relProperties.put("end_date_qual", end_date_qual);
  inserter.createRelationship(agencyID, functionID,
  DynamicRelationshipType.withName("HAS_FUNCTION"), relProperties);