Upgrade origin-src to Google Transit feed 1.2.6
[bus.git] / origin-src / transitfeed-1.2.6 / merge.py
  #!/usr/bin/python2.5
  #
  # Copyright 2007 Google Inc. All Rights Reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  # http://www.apache.org/licenses/LICENSE-2.0
  #
  # Unless required by applicable law or agreed to in writing, software
  # distributed under the License is distributed on an "AS IS" BASIS,
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
   
  """A tool for merging two Google Transit feeds.
   
Given two Google Transit feeds intended to cover two disjoint calendar
  intervals, this tool will attempt to produce a single feed by merging as much
  of the two feeds together as possible.
   
  For example, most stops remain the same throughout the year. Therefore, many
  of the stops given in stops.txt for the first feed represent the same stops
  given in the second feed. This tool will try to merge these stops so they
  only appear once in the resultant feed.
   
  A note on terminology: The first schedule is referred to as the "old" schedule;
  the second as the "new" schedule. The resultant schedule is referred to as
  the "merged" schedule. Names of things in the old schedule are variations of
  the letter "a" while names of things from the new schedule are variations of
  "b". The objects that represents routes, agencies and so on are called
  "entities".
   
  usage: merge.py [options] old_feed_path new_feed_path merged_feed_path
   
  Run merge.py --help for a list of the possible options.
  """
   
   
  __author__ = 'timothy.stranex@gmail.com (Timothy Stranex)'
   
   
  import datetime
  import optparse
  import os
  import re
  import sys
  import time
  import transitfeed
  from transitfeed import util
  import webbrowser
   
   
  # TODO:
  # 1. write unit tests that use actual data
  # 2. write a proper trip and stop_times merger
  # 3. add a serialised access method for stop_times and shapes to transitfeed
  # 4. add support for merging schedules which have some service period overlap
   
   
def ApproximateDistanceBetweenPoints(pa, pb):
  """Finds the distance between two points on the Earth's surface.

  This is an approximate distance based on assuming that the Earth is a sphere.
  The points are specified by their latitude and longitude.

  Args:
    pa: the first (lat, lon) point tuple
    pb: the second (lat, lon) point tuple

  Returns:
    The distance as a float in metres.
  """
  alat, alon = pa
  blat, blon = pb
  sa = transitfeed.Stop(lat=alat, lng=alon)
  sb = transitfeed.Stop(lat=blat, lng=blon)
  return transitfeed.ApproximateDistanceBetweenStops(sa, sb)
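# For illustration only: a hypothetical call with two nearby (lat, lon) tuples;
# the coordinates are made up and the result is the separation in metres:
#
#   ApproximateDistanceBetweenPoints((52.5200, 13.4050), (52.5206, 13.4094))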
   
   
class Error(Exception):
  """The base exception class for this module."""


class MergeError(Error):
  """An error produced when two entities could not be merged."""


class MergeProblemWithContext(transitfeed.ExceptionWithContext):
  """The base exception class for problem reporting in the merge module.

  Attributes:
    dataset_merger: The DataSetMerger that generated this problem.
    entity_type_name: The entity type of the dataset_merger. This is just
      dataset_merger.ENTITY_TYPE_NAME.
    ERROR_TEXT: The text used for generating the problem message.
  """

  def __init__(self, dataset_merger, problem_type=transitfeed.TYPE_WARNING,
               **kwargs):
    """Initialise the exception object.

    Args:
      dataset_merger: The DataSetMerger instance that generated this problem.
      problem_type: The problem severity. This should be set to one of the
        corresponding constants in transitfeed.
      kwargs: Keyword arguments to be saved as instance attributes.
    """
    kwargs['type'] = problem_type
    kwargs['entity_type_name'] = dataset_merger.ENTITY_TYPE_NAME
    transitfeed.ExceptionWithContext.__init__(self, None, None, **kwargs)
    self.dataset_merger = dataset_merger

  def FormatContext(self):
    return "In files '%s'" % self.dataset_merger.FILE_NAME
   
   
class SameIdButNotMerged(MergeProblemWithContext):
  ERROR_TEXT = ("There is a %(entity_type_name)s in the old feed with id "
                "'%(id)s' and one from the new feed with the same id but "
                "they could not be merged:")


class CalendarsNotDisjoint(MergeProblemWithContext):
  ERROR_TEXT = ("The service periods could not be merged since they are not "
                "disjoint.")


class MergeNotImplemented(MergeProblemWithContext):
  ERROR_TEXT = ("The feed merger does not currently support merging in this "
                "file. The entries have been duplicated instead.")


class FareRulesBroken(MergeProblemWithContext):
  ERROR_TEXT = ("The feed merger is currently unable to handle fare rules "
                "properly.")
   
   
class MergeProblemReporter(transitfeed.ProblemReporter):
  """The base problem reporter class for the merge module."""

  def __init__(self, accumulator):
    transitfeed.ProblemReporter.__init__(self, accumulator)

  def SameIdButNotMerged(self, dataset, entity_id, reason):
    self.AddToAccumulator(
        SameIdButNotMerged(dataset, id=entity_id, reason=reason))

  def CalendarsNotDisjoint(self, dataset):
    self.AddToAccumulator(
        CalendarsNotDisjoint(dataset, problem_type=transitfeed.TYPE_ERROR))

  def MergeNotImplemented(self, dataset):
    self.AddToAccumulator(MergeNotImplemented(dataset))

  def FareRulesBroken(self, dataset):
    self.AddToAccumulator(FareRulesBroken(dataset))
   
   
class HTMLProblemAccumulator(transitfeed.ProblemAccumulatorInterface):
  """A problem reporter which generates HTML output."""

  def __init__(self):
    """Initialise."""
    self._dataset_warnings = {}  # a map from DataSetMergers to their warnings
    self._dataset_errors = {}
    self._warning_count = 0
    self._error_count = 0

  def _Report(self, merge_problem):
    if merge_problem.IsWarning():
      dataset_problems = self._dataset_warnings
      self._warning_count += 1
    else:
      dataset_problems = self._dataset_errors
      self._error_count += 1

    problem_html = '<li>%s</li>' % (
        merge_problem.FormatProblem().replace('\n', '<br>'))
    dataset_problems.setdefault(merge_problem.dataset_merger, []).append(
        problem_html)

  def _GenerateStatsTable(self, feed_merger):
    """Generate an HTML table of merge statistics.

    Args:
      feed_merger: The FeedMerger instance.

    Returns:
      The generated HTML as a string.
    """
    rows = []
    rows.append('<tr><th class="header"/><th class="header">Merged</th>'
                '<th class="header">Copied from old feed</th>'
                '<th class="header">Copied from new feed</th></tr>')
    for merger in feed_merger.GetMergerList():
      stats = merger.GetMergeStats()
      if stats is None:
        continue
      merged, not_merged_a, not_merged_b = stats
      rows.append('<tr><th class="header">%s</th>'
                  '<td class="header">%d</td>'
                  '<td class="header">%d</td>'
                  '<td class="header">%d</td></tr>' %
                  (merger.DATASET_NAME, merged, not_merged_a, not_merged_b))
    return '<table>%s</table>' % '\n'.join(rows)
   
  def _GenerateSection(self, problem_type):
    """Generate a listing of the given type of problems.

    Args:
      problem_type: The type of problem. This is one of the problem type
        constants from transitfeed.

    Returns:
      The generated HTML as a string.
    """
    if problem_type == transitfeed.TYPE_WARNING:
      dataset_problems = self._dataset_warnings
      heading = 'Warnings'
    else:
      dataset_problems = self._dataset_errors
      heading = 'Errors'

    if not dataset_problems:
      return ''

    prefix = '<h2 class="issueHeader">%s:</h2>' % heading
    dataset_sections = []
    for dataset_merger, problems in dataset_problems.items():
      dataset_sections.append('<h3>%s</h3><ol>%s</ol>' % (
          dataset_merger.FILE_NAME, '\n'.join(problems)))
    body = '\n'.join(dataset_sections)
    return prefix + body

  def _GenerateSummary(self):
    """Generate a summary of the warnings and errors.

    Returns:
      The generated HTML as a string.
    """
    items = []
    if self._dataset_errors:
      items.append('errors: %d' % self._error_count)
    if self._dataset_warnings:
      items.append('warnings: %d' % self._warning_count)

    if items:
      return '<p><span class="fail">%s</span></p>' % '<br>'.join(items)
    else:
      return '<p><span class="pass">feeds merged successfully</span></p>'
   
  def WriteOutput(self, output_file, feed_merger,
                  old_feed_path, new_feed_path, merged_feed_path):
    """Write the HTML output to a file.

    Args:
      output_file: The file object that the HTML output will be written to.
      feed_merger: The FeedMerger instance.
      old_feed_path: The path to the old feed file as a string.
      new_feed_path: The path to the new feed file as a string.
      merged_feed_path: The path to the merged feed file as a string. This
        may be None if no merged feed was written.
    """
    if merged_feed_path is None:
      html_merged_feed_path = ''
    else:
      html_merged_feed_path = '<p>Merged feed created: <code>%s</code></p>' % (
          merged_feed_path)

    html_header = """<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>Feed Merger Results</title>
<style>
body {font-family: Georgia, serif; background-color: white}
.path {color: gray}
div.problem {max-width: 500px}
td,th {background-color: khaki; padding: 2px; font-family:monospace}
td.problem,th.problem {background-color: #dc143c; color: white; padding: 2px;
font-family:monospace}
table {border-spacing: 5px 0px; margin-top: 3px}
h3.issueHeader {padding-left: 1em}
span.pass {background-color: lightgreen}
span.fail {background-color: yellow}
.pass, .fail {font-size: 16pt; padding: 3px}
ol,.unused {padding-left: 40pt}
.header {background-color: white; font-family: Georgia, serif; padding: 0px}
th.header {text-align: right; font-weight: normal; color: gray}
.footer {font-size: 10pt}
</style>
</head>
<body>
<h1>Feed merger results</h1>
<p>Old feed: <code>%(old_feed_path)s</code></p>
<p>New feed: <code>%(new_feed_path)s</code></p>
%(html_merged_feed_path)s""" % locals()

    html_stats = self._GenerateStatsTable(feed_merger)
    html_summary = self._GenerateSummary()
    html_errors = self._GenerateSection(transitfeed.TYPE_ERROR)
    html_warnings = self._GenerateSection(transitfeed.TYPE_WARNING)

    html_footer = """
<div class="footer">
Generated using transitfeed version %s on %s.
</div>
</body>
</html>""" % (transitfeed.__version__,
              time.strftime('%B %d, %Y at %I:%M %p %Z'))

    output_file.write(transitfeed.EncodeUnicode(html_header))
    output_file.write(transitfeed.EncodeUnicode(html_stats))
    output_file.write(transitfeed.EncodeUnicode(html_summary))
    output_file.write(transitfeed.EncodeUnicode(html_errors))
    output_file.write(transitfeed.EncodeUnicode(html_warnings))
    output_file.write(transitfeed.EncodeUnicode(html_footer))
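# Sketch of how this accumulator is typically driven. The object names and
# paths below are illustrative only; the script's own main flow performs the
# equivalent steps:
#
#   accumulator = HTMLProblemAccumulator()
#   ...run the merge with a reporter backed by this accumulator...
#   accumulator.WriteOutput(open('report.html', 'w'), feed_merger,
#                           'old_feed.zip', 'new_feed.zip', 'merged_feed.zip')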
   
   
def LoadWithoutErrors(path, memory_db):
  """Return a Schedule object loaded from path; sys.exit for any error."""
  accumulator = transitfeed.ExceptionProblemAccumulator()
  loading_problem_handler = MergeProblemReporter(accumulator)
  try:
    schedule = transitfeed.Loader(path,
                                  memory_db=memory_db,
                                  problems=loading_problem_handler).Load()
  except transitfeed.ExceptionWithContext, e:
    print >>sys.stderr, (
        "\n\nFeeds to merge must load without any errors.\n"
        "While loading %s the following error was found:\n%s\n%s\n" %
        (path, e.FormatContext(), transitfeed.EncodeUnicode(e.FormatProblem())))
    sys.exit(1)
  return schedule
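# Illustrative use with made-up paths (the script's option handling supplies
# the real arguments when it loads the two input feeds):
#
#   schedule_a = LoadWithoutErrors('old_feed.zip', memory_db=True)
#   schedule_b = LoadWithoutErrors('new_feed.zip', memory_db=True)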
   
   
class DataSetMerger(object):
  """A DataSetMerger is in charge of merging a set of entities.

  This is an abstract class and should be subclassed for each different entity
  type.

  Attributes:
    ENTITY_TYPE_NAME: The name of the entity type like 'agency' or 'stop'.
    FILE_NAME: The name of the file containing this data set like 'agency.txt'.
    DATASET_NAME: A name for the dataset like 'Agencies' or 'Stops'.
  """

  def __init__(self, feed_merger):
    """Initialise.

    Args:
      feed_merger: The FeedMerger.
    """
    self.feed_merger = feed_merger
    self._num_merged = 0
    self._num_not_merged_a = 0
    self._num_not_merged_b = 0

  def _MergeIdentical(self, a, b):
    """Tries to merge two values. The values are required to be identical.

    Args:
      a: The first value.