Add some friendly name/duplicate substitution rules
[bus.git] / maxious-canberra-transit-feed / extracttimes.rb
blob:a/maxious-canberra-transit-feed/extracttimes.rb -> blob:b/maxious-canberra-transit-feed/extracttimes.rb
require 'rubygems' require 'rubygems'
require 'nokogiri' require 'nokogiri'
require 'open-uri' require 'open-uri'
require 'pp' require 'pp'
   
def makeTimetable(table, period, short_name) def makeTimetable(table, period, short_name)
timetable = {"stop_times" => [], "between_stops" => [], "short_name" => short_name} timetable = {"stop_times" => [], "between_stops" => [], "short_name" => short_name}
time_points = table.xpath('tr[1]//th').map do |tp| time_points = table.xpath('tr[1]//th').map do |tp|
if tp.content != "\302\240" && tp.content != "" && tp.content != "<br/>" if tp.content != "\302\240" && tp.content != "" && tp.content != "<br/>"
timing_point = tp.content.squeeze(" ").gsub("\r\n Platform"," - Platform").strip timing_point = tp.content.squeeze(" ").gsub("\r\n Platform"," - Platform").gsub(" - "," - ").gsub("\n","").strip
end end
end end
time_points.delete(nil) time_points.delete(nil)
timetable["time_points"] = time_points timetable["time_points"] = time_points
timetable["long_name"] = "To " + time_points.last timetable["long_name"] = "To " + time_points.last
periodtimes = [] periodtimes = []
table.css('tr').each do |row| table.css('tr').each do |row|
times = row.css('td').map do |cell| times = row.css('td').map do |cell|
#TODO convert to GTFS time #TODO convert to GTFS time ie. replace " AM" with a
time = cell.content.squeeze(" ").strip time = cell.content.squeeze(" ").strip
end end
if not times.empty? if not times.empty?
if not (route = times.shift) if not (route = times.shift)
raise("TODO: account for shifting route numbers eg. intertown/redex 62/162") raise("TODO: account for shifting route numbers eg. intertown/redex 62/162")
end end
periodtimes << times periodtimes << times
end end
end end
if periodtimes.size < 1 if periodtimes.size < 1
raise "No times for route " + short_name + " in period " + period raise "No times for route " + short_name + " in period " + period
end end
timetable["stop_times"] = { period => periodtimes } timetable["stop_times"] = { period => periodtimes }
# pp timetable # pp timetable
filename = timetable["short_name"] + "-" + timetable["long_name"].downcase.gsub(" ","-").gsub("/","") + "." + period + ".yml" filename = timetable["short_name"] + "-" + timetable["long_name"].downcase.gsub(" ","-").gsub("/","") + "." + period + ".yml"
puts "Saving " + filename puts "Saving " + filename
File.open("#{File.dirname(__FILE__)}/output/"+filename, "w") do |f| File.open("#{File.dirname(__FILE__)}/output/"+filename, "w") do |f|
f.write timetable.to_yaml f.write timetable.to_yaml
end end
timetable timetable
end end
   
#TODO fix route 934 #TODO fix route 934
Dir.glob("source-html/Route*.htm*") { |file| Dir.glob("source-html/Route*.htm*") { |file|
puts "Opened " + file puts "Opened " + file
doc = Nokogiri::HTML(open(file)) doc = Nokogiri::HTML(open(file))
# Search for nodes by css # Search for nodes by css
timetables = [] timetables = []
short_name = ""; short_name = "";
doc.xpath('//title').each do |title| doc.xpath('//title').each do |title|
short_name = title.content.gsub("Route_","").gsub("Route ","").squeeze(" ").strip short_name = title.content.gsub("Route_","").gsub("Route ","").squeeze(" ").strip
end end
if short_name == "" if short_name == ""
raise "Route number(s) not found in <title> tag" raise "Route number(s) not found in <title> tag"
end end
   
doc.xpath('//table[preceding::text()="Weekdays"]').each do |table| doc.xpath('//table[preceding::text()="Weekdays"]').each do |table|
timetables << makeTimetable(table, "weekday", short_name) timetables << makeTimetable(table, "weekday", short_name)
end end
   
#weekends #weekends
doc.xpath('//table[preceding::text()="Saturdays" and following::a]').each do |table| doc.xpath('//table[preceding::text()="Saturdays" and following::a]').each do |table|
timetables << makeTimetable(table, "saturday", short_name) timetables << makeTimetable(table, "saturday", short_name)
end end
doc.xpath('//table[preceding::text()="Sundays"]').each do |table| doc.xpath('//table[preceding::text()="Sundays"]').each do |table|
timetables << makeTimetable(table, "sunday", short_name) timetables << makeTimetable(table, "sunday", short_name)
end end
#930/934 special cases #930/934 special cases
doc.xpath('//table[preceding::text()="Saturday" and following::h2]').each do |table| doc.xpath('//table[preceding::text()="Saturday" and following::h2]').each do |table|
timetables << makeTimetable(table, "saturday", short_name) timetables << makeTimetable(table, "saturday", short_name)
end end
doc.xpath('//table[preceding::text()="Sunday"]').each do |table| doc.xpath('//table[preceding::text()="Sunday"]').each do |table|
timetables << makeTimetable(table, "sunday", short_name) timetables << makeTimetable(table, "sunday", short_name)
end end
#route 81 = Weekdays - School Holidays Only #route 81 = Weekdays - School Holidays Only
doc.xpath('//table[preceding::text()="Weekdays - School Holidays Only "]').each do |table| doc.xpath('//table[preceding::text()="Weekdays - School Holidays Only "]').each do |table|
timetable = makeTimetable(table, "weekday", short_name) timetable = makeTimetable(table, "weekday", short_name)
#TODO set active date range to only be holidays #TODO set active date range to only be holidays
timetables << timetable; timetables << timetable;
end end
   
if timetables.size > 2 if timetables.size > 2
puts "WARNING: " + file + " more than 2 timetables (weekend split?):" + timetables.size.to_s puts "WARNING: " + file + " more than 2 timetables (weekend split?):" + timetables.size.to_s
end end
if timetables.size < 2 if timetables.size < 2
puts "WARNING: " + file + " less than 2 timetables (weekday loop service?):" + timetables.size.to_s puts "WARNING: " + file + " less than 2 timetables (weekday loop service?):" + timetables.size.to_s
elsif not (timetables[0]["time_points"] - timetables[1]["time_points"].reverse).empty? elsif not (timetables[0]["time_points"] - timetables[1]["time_points"].reverse).empty?
puts "WARNING: first pair of timetable timing points are not complementary for "+ file puts "WARNING: first pair of timetable timing points are not complementary for "+ file
pp(timetables[0]["time_points"] - timetables[1]["time_points"].reverse) pp(timetables[0]["time_points"] - timetables[1]["time_points"].reverse)
end end
if timetables.size < 1 if timetables.size < 1
raise "No timetables extracted from " + file raise "No timetables extracted from " + file
end end
} }