Upgrade origin-src to google transit feed 1.2.6
[bus.git] / origin-src / transitfeed-1.2.6 / merge.py
blob:a/origin-src/transitfeed-1.2.6/merge.py -> blob:b/origin-src/transitfeed-1.2.6/merge.py
--- a/origin-src/transitfeed-1.2.6/merge.py
+++ b/origin-src/transitfeed-1.2.6/merge.py
@@ -1,1 +1,1830 @@
-
+#!/usr/bin/python2.5
+#
+# Copyright 2007 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A tool for merging two Google Transit feeds.
+
+Given two Google Transit feeds intending to cover two disjoint calendar
+intervals, this tool will attempt to produce a single feed by merging as much
+of the two feeds together as possible.
+
+For example, most stops remain the same throughout the year. Therefore, many
+of the stops given in stops.txt for the first feed represent the same stops
+given in the second feed. This tool will try to merge these stops so they
+only appear once in the resultant feed.
+
+A note on terminology: The first schedule is referred to as the "old" schedule;
+the second as the "new" schedule. The resultant schedule is referred to as
+the "merged" schedule. Names of things in the old schedule are variations of
+the letter "a" while names of things from the new schedule are variations of
+"b". The objects that represent routes, agencies and so on are called
+"entities".
+
+usage: merge.py [options] old_feed_path new_feed_path merged_feed_path
+
+Run merge.py --help for a list of the possible options.
+"""
+
+
+__author__ = 'timothy.stranex@gmail.com (Timothy Stranex)'
+
+
+import datetime
+import optparse
+import os
+import re
+import sys
+import time
+import transitfeed
+from transitfeed import util
+import webbrowser
+
+
+# TODO:
+# 1. write unit tests that use actual data
+# 2. write a proper trip and stop_times merger
+# 3. add a serialised access method for stop_times and shapes to transitfeed
+# 4. add support for merging schedules which have some service period overlap
+
+
def ApproximateDistanceBetweenPoints(pa, pb):
  """Return the approximate distance between two points on the Earth.

  The Earth is modelled as a sphere, so the result is only approximate.
  Each point is specified by its latitude and longitude in degrees.

  Args:
    pa: the first (lat, lon) point tuple
    pb: the second (lat, lon) point tuple

  Returns:
    The distance as a float in metres.
  """
  lat_a, lon_a = pa
  lat_b, lon_b = pb
  stop_a = transitfeed.Stop(lat=lat_a, lng=lon_a)
  stop_b = transitfeed.Stop(lat=lat_b, lng=lon_b)
  return transitfeed.ApproximateDistanceBetweenStops(stop_a, stop_b)
+
+
class Error(Exception):
  """Root of the exception hierarchy for this module."""
+
class MergeError(Error):
  """Raised when two entities cannot be merged into one."""
+
+
class MergeProblemWithContext(transitfeed.ExceptionWithContext):
  """Base class for problems reported while merging two feeds.

  Attributes:
    dataset_merger: The DataSetMerger that generated this problem.
    entity_type_name: Shorthand for dataset_merger.ENTITY_TYPE_NAME.
    ERROR_TEXT: Template used when formatting the problem message.
  """

  def __init__(self, dataset_merger, problem_type=transitfeed.TYPE_WARNING,
               **kwargs):
    """Create the problem, recording which merger produced it.

    Args:
      dataset_merger: The DataSetMerger instance that generated this problem.
      problem_type: The problem severity; one of the corresponding constants
                    in transitfeed.
      kwargs: Extra keyword arguments saved as instance attributes.
    """
    kwargs.update(type=problem_type,
                  entity_type_name=dataset_merger.ENTITY_TYPE_NAME)
    transitfeed.ExceptionWithContext.__init__(self, None, None, **kwargs)
    self.dataset_merger = dataset_merger

  def FormatContext(self):
    """Describe where the problem occurred: the merger's file name."""
    return "In files '%s'" % self.dataset_merger.FILE_NAME
+
+
class SameIdButNotMerged(MergeProblemWithContext):
  """Reported when same-id entities from the two feeds fail to merge."""
  ERROR_TEXT = ("There is a %(entity_type_name)s in the old feed with id "
                "'%(id)s' and one from the new feed with the same id but "
                "they could not be merged:")
+
+
class CalendarsNotDisjoint(MergeProblemWithContext):
  """Reported when the two feeds' service periods overlap in time."""
  ERROR_TEXT = ("The service periods could not be merged since they are not "
                "disjoint.")
+
+
class MergeNotImplemented(MergeProblemWithContext):
  """Reported when a file type is duplicated rather than truly merged."""
  ERROR_TEXT = ("The feed merger does not currently support merging in this "
                "file. The entries have been duplicated instead.")
+
+
class FareRulesBroken(MergeProblemWithContext):
  """Reported when fare rules cannot be handled correctly by the merger."""
  ERROR_TEXT = ("The feed merger is currently unable to handle fare rules "
                "properly.")
+
+
class MergeProblemReporter(transitfeed.ProblemReporter):
  """Problem reporter used throughout the merge module.

  Each method builds one of the problem classes above and hands it to the
  configured accumulator.
  """

  def __init__(self, accumulator):
    transitfeed.ProblemReporter.__init__(self, accumulator)

  def SameIdButNotMerged(self, dataset, entity_id, reason):
    """Report that two entities share an id but could not be merged."""
    problem = SameIdButNotMerged(dataset, id=entity_id, reason=reason)
    self.AddToAccumulator(problem)

  def CalendarsNotDisjoint(self, dataset):
    """Report overlapping service periods; this one is an error."""
    problem = CalendarsNotDisjoint(dataset,
                                   problem_type=transitfeed.TYPE_ERROR)
    self.AddToAccumulator(problem)

  def MergeNotImplemented(self, dataset):
    """Report that merging is unsupported for the dataset's file type."""
    self.AddToAccumulator(MergeNotImplemented(dataset))

  def FareRulesBroken(self, dataset):
    """Report that fare rules could not be handled properly."""
    self.AddToAccumulator(FareRulesBroken(dataset))
+
+
class HTMLProblemAccumulator(transitfeed.ProblemAccumulatorInterface):
  """A problem reporter which generates HTML output."""

  def __init__(self):
    """Initialise."""
    self._dataset_warnings = {}  # a map from DataSetMergers to their warnings
    self._dataset_errors = {}  # a map from DataSetMergers to their errors
    self._warning_count = 0
    self._error_count = 0

  def _Report(self, merge_problem):
    """File the problem under its merger, in the warning or error bucket."""
    if merge_problem.IsWarning():
      dataset_problems = self._dataset_warnings
      self._warning_count += 1
    else:
      dataset_problems = self._dataset_errors
      self._error_count += 1

    # Newlines become <br> so a multi-line message stays inside one <li>.
    problem_html = '<li>%s</li>' % (
        merge_problem.FormatProblem().replace('\n', '<br>'))
    dataset_problems.setdefault(merge_problem.dataset_merger, []).append(
        problem_html)

  def _GenerateStatsTable(self, feed_merger):
    """Generate an HTML table of merge statistics.

    Args:
      feed_merger: The FeedMerger instance.

    Returns:
      The generated HTML as a string.
    """
    rows = []
    rows.append('<tr><th class="header"/><th class="header">Merged</th>'
                '<th class="header">Copied from old feed</th>'
                '<th class="header">Copied from new feed</th></tr>')
    for merger in feed_merger.GetMergerList():
      stats = merger.GetMergeStats()
      # Mergers with no statistics (GetMergeStats() returning None) are
      # simply omitted from the table.
      if stats is None:
        continue
      merged, not_merged_a, not_merged_b = stats
      rows.append('<tr><th class="header">%s</th>'
                  '<td class="header">%d</td>'
                  '<td class="header">%d</td>'
                  '<td class="header">%d</td></tr>' %
                  (merger.DATASET_NAME, merged, not_merged_a, not_merged_b))
    return '<table>%s</table>' % '\n'.join(rows)

  def _GenerateSection(self, problem_type):
    """Generate a listing of the given type of problems.

    Args:
      problem_type: The type of problem. This is one of the problem type
                    constants from transitfeed.

    Returns:
      The generated HTML as a string.
    """
    if problem_type == transitfeed.TYPE_WARNING:
      dataset_problems = self._dataset_warnings
      heading = 'Warnings'
    else:
      dataset_problems = self._dataset_errors
      heading = 'Errors'

    if not dataset_problems:
      return ''

    # NOTE(review): this emits <h2 class="issueHeader"> but the stylesheet in
    # WriteOutput only defines a rule for h3.issueHeader — confirm which tag
    # was intended; kept verbatim here.
    prefix = '<h2 class="issueHeader">%s:</h2>' % heading
    dataset_sections = []
    for dataset_merger, problems in dataset_problems.items():
      dataset_sections.append('<h3>%s</h3><ol>%s</ol>' % (
          dataset_merger.FILE_NAME, '\n'.join(problems)))
    body = '\n'.join(dataset_sections)
    return prefix + body

  def _GenerateSummary(self):
    """Generate a summary of the warnings and errors.

    Returns:
      The generated HTML as a string.
    """
    items = []
    if self._dataset_errors:
      items.append('errors: %d' % self._error_count)
    if self._dataset_warnings:
      items.append('warnings: %d' % self._warning_count)

    if items:
      return '<p><span class="fail">%s</span></p>' % '<br>'.join(items)
    else:
      return '<p><span class="pass">feeds merged successfully</span></p>'

  def WriteOutput(self, output_file, feed_merger,
                  old_feed_path, new_feed_path, merged_feed_path):
    """Write the HTML output to a file.

    Args:
      output_file: The file object that the HTML output will be written to.
      feed_merger: The FeedMerger instance.
      old_feed_path: The path to the old feed file as a string.
      new_feed_path: The path to the new feed file as a string
      merged_feed_path: The path to the merged feed file as a string. This
                        may be None if no merged feed was written.
    """
    if merged_feed_path is None:
      html_merged_feed_path = ''
    else:
      html_merged_feed_path = '<p>Merged feed created: <code>%s</code></p>' % (
          merged_feed_path)

    # NOTE(review): 'background-color: dc143c' in the style block below looks
    # like it is missing a leading '#'; kept verbatim.
    # The '%(name)s' placeholders below are filled from local variables via
    # '% locals()'.
    html_header = """<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>Feed Merger Results</title>
<style>
  body {font-family: Georgia, serif; background-color: white}
  .path {color: gray}
  div.problem {max-width: 500px}
  td,th {background-color: khaki; padding: 2px; font-family:monospace}
  td.problem,th.problem {background-color: dc143c; color: white; padding: 2px;
                         font-family:monospace}
  table {border-spacing: 5px 0px; margin-top: 3px}
  h3.issueHeader {padding-left: 1em}
  span.pass {background-color: lightgreen}
  span.fail {background-color: yellow}
  .pass, .fail {font-size: 16pt; padding: 3px}
  ol,.unused {padding-left: 40pt}
  .header {background-color: white; font-family: Georgia, serif; padding: 0px}
  th.header {text-align: right; font-weight: normal; color: gray}
  .footer {font-size: 10pt}
</style>
</head>
<body>
<h1>Feed merger results</h1>
<p>Old feed: <code>%(old_feed_path)s</code></p>
<p>New feed: <code>%(new_feed_path)s</code></p>
%(html_merged_feed_path)s""" % locals()

    html_stats = self._GenerateStatsTable(feed_merger)
    html_summary = self._GenerateSummary()
    html_errors = self._GenerateSection(transitfeed.TYPE_ERROR)
    html_warnings = self._GenerateSection(transitfeed.TYPE_WARNING)

    html_footer = """
<div class="footer">
Generated using transitfeed version %s on %s.
</div>
</body>
</html>""" % (transitfeed.__version__,
              time.strftime('%B %d, %Y at %I:%M %p %Z'))

    # Sections are written in display order; EncodeUnicode guards against
    # non-ASCII content in feed names and problem messages.
    output_file.write(transitfeed.EncodeUnicode(html_header))
    output_file.write(transitfeed.EncodeUnicode(html_stats))
    output_file.write(transitfeed.EncodeUnicode(html_summary))
    output_file.write(transitfeed.EncodeUnicode(html_errors))
    output_file.write(transitfeed.EncodeUnicode(html_warnings))
    output_file.write(transitfeed.EncodeUnicode(html_footer))
+
+
+def LoadWithoutErrors(path, memory_db):
+  """"Return a Schedule object loaded from path; sys.exit for any error."""
+  accumulator = transitfeed.ExceptionProblemAccumulator()
+  loading_problem_handler = MergeProblemReporter(accumulator)
+  try:
+    schedule = transitfeed.Loader(path,
+                                  memory_db=memory_db,
+                                  problems=loading_problem_handler).Load()
+  except transitfeed.ExceptionWithContext, e:
+    print >>sys.stderr, (
+        "\n\nFeeds to merge must load without any errors.\n"
+        "While loading %s the following error was found:\n%s\n%s\n" %
+        (path, e.FormatContext(), transitfeed.EncodeUnicode(e.FormatProblem())))
+    sys.exit(1)
+  return schedule
+
+
class DataSetMerger(object):
  """A DataSetMerger is in charge of merging a set of entities.

  This is an abstract class and should be subclassed for each different entity
  type.

  Attributes:
    ENTITY_TYPE_NAME: The name of the entity type like 'agency' or 'stop'.
    FILE_NAME: The name of the file containing this data set like 'agency.txt'.
    DATASET_NAME: A name for the dataset like 'Agencies' or 'Stops'.
  """

  def __init__(self, feed_merger):
    """Initialise.

    Args:
      feed_merger: The FeedMerger.
    """
    self.feed_merger = feed_merger
    # Counters reported by GetMergeStats(); maintained by the _Merge* methods.
    self._num_merged = 0
    self._num_not_merged_a = 0
    self._num_not_merged_b = 0

  def _MergeIdentical(self, a, b):
    """Tries to merge two values. The values are required to be identical.

    Args:
      a: The first value.
      b: The second value.

    Returns:
      The trivially merged value.

    Raises:
      MergeError: The values were not identical.
    """
    if a != b:
      raise MergeError("values must be identical ('%s' vs '%s')" %
                       (transitfeed.EncodeUnicode(a),
                        transitfeed.EncodeUnicode(b)))
    return b

  def _MergeIdenticalCaseInsensitive(self, a, b):
    """Tries to merge two strings.

    The strings are required to be the same ignoring case. The second string
    is always used as the merged value.

    Args:
      a: The first string.
      b: The second string.

    Returns:
      The merged string. This is equal to the second string.

    Raises:
      MergeError: The strings were not the same ignoring case.
    """
    if a.lower() != b.lower():
      raise MergeError("values must be the same (case insensitive) "
                       "('%s' vs '%s')" % (transitfeed.EncodeUnicode(a),
                                           transitfeed.EncodeUnicode(b)))
    return b

  def _MergeOptional(self, a, b):
    """Tries to merge two values which may be None.

    If both values are not None, they are required to be the same and the
    merge is trivial. If one of the values is None and the other is not None,
    the merge results in the one which is not None. If both are None, the merge
    results in None.

    Note: this uses truthiness rather than identity with None, so falsy
    values such as '' and 0 are treated the same way as None.

    Args:
      a: The first value.
      b: The second value.

    Returns:
      The merged value.

    Raises:
      MergeError: If both values are not None and are not the same.
    """
    if a and b:
      if a != b:
        raise MergeError("values must be identical if both specified "
                         "('%s' vs '%s')" % (transitfeed.EncodeUnicode(a),
                                             transitfeed.EncodeUnicode(b)))
    return a or b

  def _MergeSameAgency(self, a_agency_id, b_agency_id):
    """Merge agency ids to the corresponding agency id in the merged schedule.

    Args:
      a_agency_id: an agency id from the old schedule
      b_agency_id: an agency id from the new schedule

    Returns:
      The agency id of the corresponding merged agency.

    Raises:
      MergeError: If a_agency_id and b_agency_id do not correspond to the same
                  merged agency.
      KeyError: Either a_agency_id or b_agency_id is not a valid agency id.
    """
    a_agency_id = (a_agency_id or
                   self.feed_merger.a_schedule.GetDefaultAgency().agency_id)
    b_agency_id = (b_agency_id or
                   self.feed_merger.b_schedule.GetDefaultAgency().agency_id)
    # _migrated_entity is presumably attached to the original agencies when
    # the agency merger migrates them — TODO confirm it is always set before
    # this method is called.
    a_agency = self.feed_merger.a_schedule.GetAgency(
        a_agency_id)._migrated_entity
    b_agency = self.feed_merger.b_schedule.GetAgency(
        b_agency_id)._migrated_entity
    if a_agency != b_agency:
      raise MergeError('agency must be the same')
    return a_agency.agency_id

  def _SchemedMerge(self, scheme, a, b):
    """Tries to merge two entities according to a merge scheme.

    A scheme is specified by a map where the keys are entity attributes and
    the values are merge functions like DataSetMerger._MergeIdentical or
    DataSetMerger._MergeOptional. The entity is first migrated to the merged
    schedule. Then the attributes are individually merged as specified by the
    scheme.

    Args:
      scheme: The merge scheme, a map from entity attributes to merge
              functions.
      a: The entity from the old schedule.
      b: The entity from the new schedule.

    Returns:
      The migrated and merged entity.

    Raises:
      MergeError: One of the attributes was not able to be merged.
    """
    # newid=False: the migrated entity keeps b's original id.
    migrated = self._Migrate(b, self.feed_merger.b_schedule, False)
    for attr, merger in scheme.items():
      a_attr = getattr(a, attr, None)
      b_attr = getattr(b, attr, None)
      try:
        merged_attr = merger(a_attr, b_attr)
      except MergeError, merge_error:
        raise MergeError("Attribute '%s' could not be merged: %s." % (
            attr, merge_error))
      setattr(migrated, attr, merged_attr)
    return migrated

  def _MergeSameId(self):
    """Tries to merge entities based on their ids.

    This tries to merge only the entities from the old and new schedules which
    have the same id. These are added into the merged schedule. Entities which
    do not merge or do not have the same id as another entity in the other
    schedule are simply migrated into the merged schedule.

    This method is less flexible than _MergeDifferentId since it only tries
    to merge entities which have the same id while _MergeDifferentId tries to
    merge everything. However, it is faster and so should be used whenever
    possible.

    This method makes use of various methods like _Merge and _Migrate which
    are not implemented in the abstract DataSetMerger class. These method
    should be overwritten in a subclass to allow _MergeSameId to work with
    different entity types.

    Returns:
      The number of merged entities.
    """
    a_not_merged = []
    b_not_merged = []

    for a in self._GetIter(self.feed_merger.a_schedule):
      try:
        b = self._GetById(self.feed_merger.b_schedule, self._GetId(a))
      except KeyError:
        # there was no entity in B with the same id as a
        a_not_merged.append(a)
        continue
      try:
        self._Add(a, b, self._MergeEntities(a, b))
        self._num_merged += 1
      except MergeError, merge_error:
        a_not_merged.append(a)
        b_not_merged.append(b)
        self._ReportSameIdButNotMerged(self._GetId(a), merge_error)

    # B entities whose merge with A failed were already collected above; this
    # loop only picks up B entities whose id does not occur in A at all.
    for b in self._GetIter(self.feed_merger.b_schedule):
      try:
        a = self._GetById(self.feed_merger.a_schedule, self._GetId(b))
      except KeyError:
        # there was no entity in A with the same id as b
        b_not_merged.append(b)

    # migrate the remaining entities
    for a in a_not_merged:
      newid = self._HasId(self.feed_merger.b_schedule, self._GetId(a))
      self._Add(a, None, self._Migrate(a, self.feed_merger.a_schedule, newid))
    for b in b_not_merged:
      newid = self._HasId(self.feed_merger.a_schedule, self._GetId(b))
      self._Add(None, b, self._Migrate(b, self.feed_merger.b_schedule, newid))

    self._num_not_merged_a = len(a_not_merged)
    self._num_not_merged_b = len(b_not_merged)
    return self._num_merged

  def _MergeByIdKeepNew(self):
    """Migrate all entities, discarding duplicates from the old/a schedule.

    This method migrates all entities from the new/b schedule. It then migrates
    entities in the old schedule where there isn't already an entity with the
    same ID.

    Unlike _MergeSameId this method migrates entities to the merged schedule
    before comparing their IDs. This allows transfers to be compared when they
    refer to stops that had their ID updated by migration.

    This method makes use of various methods like _Migrate and _Add which
    are not implemented in the abstract DataSetMerger class. These methods
    should be overwritten in a subclass to allow _MergeByIdKeepNew to work with
    different entity types.

    Returns:
      The number of merged entities.
    """
    # NOTE(review): _Migrate is called here with two arguments while the
    # abstract _Migrate takes (entity, schedule, newid); subclasses used with
    # this method must accept a default for newid — TODO confirm.
    # Maps from migrated ID to tuple(original object, migrated object)
    a_orig_migrated = {}
    b_orig_migrated = {}

    for orig in self._GetIter(self.feed_merger.a_schedule):
      migrated = self._Migrate(orig, self.feed_merger.a_schedule)
      a_orig_migrated[self._GetId(migrated)] = (orig, migrated)

    for orig in self._GetIter(self.feed_merger.b_schedule):
      migrated = self._Migrate(orig, self.feed_merger.b_schedule)
      b_orig_migrated[self._GetId(migrated)] = (orig, migrated)

    for migrated_id, (orig, migrated) in b_orig_migrated.items():
      self._Add(None, orig, migrated)
      self._num_not_merged_b += 1

    # NOTE(review): a-entities whose migrated id collides with a b-entity are
    # dropped and not counted in any statistic — presumably intentional since
    # the new feed's version wins.
    for migrated_id, (orig, migrated) in a_orig_migrated.items():
      if migrated_id not in b_orig_migrated:
        self._Add(orig, None, migrated)
        self._num_not_merged_a += 1
    return self._num_merged

  def _MergeDifferentId(self):
    """Tries to merge all possible combinations of entities.

    This tries to merge every entity in the old schedule with every entity in
    the new schedule. Unlike _MergeSameId, the ids do not need to match.
    However, _MergeDifferentId is much slower than _MergeSameId.

    This method makes use of various methods like _Merge and _Migrate which
    are not implemented in the abstract DataSetMerger class. These method
    should be overwritten in a subclass to allow _MergeDifferentId to work
    with different entity types.

    Returns:
      The number of merged entities.
    """
    # TODO: The same entity from A could merge with multiple from B.
    # This should either generate an error or should be prevented from
    # happening.
    for a in self._GetIter(self.feed_merger.a_schedule):
      for b in self._GetIter(self.feed_merger.b_schedule):
        try:
          self._Add(a, b, self._MergeEntities(a, b))
          self._num_merged += 1
        except MergeError:
          continue

    for a in self._GetIter(self.feed_merger.a_schedule):
      if a not in self.feed_merger.a_merge_map:
        self._num_not_merged_a += 1
        newid = self._HasId(self.feed_merger.b_schedule, self._GetId(a))
        self._Add(a, None,
                  self._Migrate(a, self.feed_merger.a_schedule, newid))
    for b in self._GetIter(self.feed_merger.b_schedule):
      if b not in self.feed_merger.b_merge_map:
        self._num_not_merged_b += 1
        newid = self._HasId(self.feed_merger.a_schedule, self._GetId(b))
        self._Add(None, b,
                  self._Migrate(b, self.feed_merger.b_schedule, newid))

    return self._num_merged

  def _ReportSameIdButNotMerged(self, entity_id, reason):
    """Report that two entities have the same id but could not be merged.

    Args:
      entity_id: The id of the entities.
      reason: A string giving a reason why they could not be merged.
    """
    self.feed_merger.problem_reporter.SameIdButNotMerged(self,
                                                         entity_id,
                                                         reason)

  def _GetIter(self, schedule):
    """Returns an iterator of entities for this data set in the given schedule.

    This method usually corresponds to one of the methods from
    transitfeed.Schedule like GetAgencyList() or GetRouteList().

    Note: This method must be overwritten in a subclass if _MergeSameId or
    _MergeDifferentId are to be used.

    Args:
      schedule: Either the old or new schedule from the FeedMerger.

    Returns:
      An iterator of entities.
    """
    raise NotImplementedError()

  def _GetById(self, schedule, entity_id):
    """Returns an entity given its id.

    This method usually corresponds to one of the methods from
    transitfeed.Schedule like GetAgency() or GetRoute().

    Note: This method must be overwritten in a subclass if _MergeSameId or
    _MergeDifferentId are to be used.

    Args:
      schedule: Either the old or new schedule from the FeedMerger.
      entity_id: The id string of the entity.

    Returns:
      The entity with the given id.

    Raises:
      KeyError: There is no entity with the given id.
    """
    raise NotImplementedError()

  def _HasId(self, schedule, entity_id):
    """Check if the schedule has an entity with the given id.

    Args:
      schedule: The transitfeed.Schedule instance to look in.
      entity_id: The id of the entity.

    Returns:
      True if the schedule has an entity with the id or False if not.
    """
    try:
      self._GetById(schedule, entity_id)
      has = True
    except KeyError:
      has = False
    return has

  def _MergeEntities(self, a, b):
    """Tries to merge the two entities.

    Note: This method must be overwritten in a subclass if _MergeSameId or
    _MergeDifferentId are to be used.

    Args:
      a: The entity from the old schedule.
      b: The entity from the new schedule.

    Returns:
      The merged migrated entity.

    Raises:
      MergeError: The entities were not able to be merged.
    """
    raise NotImplementedError()

  def _Migrate(self, entity, schedule, newid):
    """Migrates the entity to the merge schedule.

    This involves copying the entity and updating any ids to point to the
    corresponding entities in the merged schedule. If newid is True then
    a unique id is generated for the migrated entity using the original id
    as a prefix.

    Note: This method must be overwritten in a subclass if _MergeSameId or
    _MergeDifferentId are to be used.

    Args:
      entity: The entity to migrate.
      schedule: The schedule from the FeedMerger that contains entity.
      newid: Whether to generate a new id (True) or keep the original (False).

    Returns:
      The migrated entity.
    """
    raise NotImplementedError()

  def _Add(self, a, b, migrated):
    """Adds the migrated entity to the merged schedule.

    If a and b are both not None, it means that a and b were merged to create
    migrated. If one of a or b is None, it means that the other was not merged
    but has been migrated. This mapping is registered with the FeedMerger.

    Note: This method must be overwritten in a subclass if _MergeSameId or
    _MergeDifferentId are to be used.

    Args:
      a: The original entity from the old schedule.
      b: The original entity from the new schedule.
      migrated: The migrated entity for the merged schedule.
    """
    raise NotImplementedError()

  def _GetId(self, entity):
    """Returns the id of the given entity.

    Note: This method must be overwritten in a subclass if _MergeSameId or
    _MergeDifferentId are to be used.

    Args:
      entity: The entity.

    Returns:
      The id of the entity as a string or None.
    """
    raise NotImplementedError()

  def MergeDataSets(self):
    """Merge the data sets.

    This method is called in FeedMerger.MergeSchedule().

    Note: This method must be overwritten in a subclass.

    Returns:
      A boolean which is False if the dataset was unable to be merged and
      as a result the entire merge should be aborted. In this case, the problem
      will have been reported using the FeedMerger's problem reporter.
    """
    raise NotImplementedError()

  def GetMergeStats(self):
    """Returns some merge statistics.

    These are given as a tuple (merged, not_merged_a, not_merged_b) where
    "merged" is the number of merged entities, "not_merged_a" is the number of
    entities from the old schedule that were not merged and "not_merged_b" is
    the number of entities from the new schedule that were not merged.

    The return value can also be None. This means that there are no statistics
    for this entity type. (This base implementation always returns the tuple;
    the None case is for subclasses that override this method.)

    The statistics are only available after MergeDataSets() has been called.

    Returns:
      Either the statistics tuple or None.
    """
    return (self._num_merged, self._num_not_merged_a, self._num_not_merged_b)
+
+
class AgencyMerger(DataSetMerger):
  """A DataSetMerger for agencies."""

  # Names used by the problem reporter and the merge-statistics table.
  ENTITY_TYPE_NAME = 'agency'
  FILE_NAME = 'agency.txt'
  DATASET_NAME = 'Agencies'

  def _GetIter(self, schedule):
    """Return an iterator over all agencies in the given schedule."""
    return schedule.GetAgencyList()

  def _GetById(self, schedule, agency_id):
    """Return the agency with the given id from the given schedule."""
    return schedule.GetAgency(agency_id)
+
+  def _MergeEntities(self, a, b):
+    """Merges two agencies.
+
+    To be merged, they are required to have the same id, name, url and
+    timezone. The remaining language attribute is taken from the new agency.
+
+    Args:
+      a: The first agency.
+      b: The second agency.
+
+    Returns:
+      The merged agency.
+
+    Raises:
+      MergeError: The agencies could not be merged.
+    """
+
+    def _MergeAgencyId(a_agency_id, b_agency_id):
+      """Merge two agency ids.
+
+      The only difference between this and _MergeIdentical() is that the values
+      None and '' are regarded as being the same.
+
+      Args:
+        a_agency_id: The first agency id.
+        b_agency_id: The second agency id.
+
+      Returns:
+        The merged agency id.
+
+      Raises:
+        MergeError: The agency ids could not be merged.
+      """
+      a_agency_id = a_agency_id or None
+      b_agency_id = b_agency_id or None
+      return self._MergeIdentical(a_agency_id, b_agency_id)
+
+    scheme = {'agency_id': _MergeAgencyId,
+              'agency_name': self._MergeIdentical,
+              'agency_url': self._MergeIdentical,
+              'agency_timezone': self._MergeIdentical}
+    return self._SchemedMerge(scheme, a, b)
+
+  def _Migrate(self, entity, schedule, newid):
+    a = transitfeed.Agency(field_dict=entity)
+    if newid:
+      a.agency_id = self.feed_merger.GenerateId(entity.agency_id)