require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'pp'
require 'yaml'
# Reopens the core Array class so that every array answers :inline when
# asked for its YAML style.
# NOTE(review): presumably this hook is consulted by the YAML emitter so that
# arrays are written in compact flow form ([a, b, c]) in the generated .yml
# files; whether the emitter in use honours it depends on the Ruby/YAML
# engine version - confirm before relying on the output layout.
class Array
  # @return [Symbol] always :inline
  def to_yaml_style
    :inline
  end
end
|
|
|
|
# Turns one header cell into a canonical timing-point name.
#
# tp must respond to #content (cell text) and #to_s (cell markup).
# Returns nil when the cell is not a timing point: blank / non-breaking-space /
# "<br/>" cells, and cells whose markup contains a three-digit number or the
# word "Wheelchair".
def normalize_timing_point(tp)
  raw = tp.content
  return nil if raw == "\302\240" || raw == "" || raw == "<br/>"

  markup = tp.to_s
  return nil if markup.match(/[0-9][0-9][0-9]/) || markup.include?("Wheelchair")

  name = raw.squeeze(" ")
  # Unify stop naming: interchanges are called bus stations, platform numbers
  # are wrapped in parentheses (the closing one is appended below).
  name = name.gsub("Shops", " ").gsub("Bus Station", " Bus Station ").gsub("Interchange", " Bus Station ").gsub(" Platform", " (Platform")
  # Known spelling / naming variants in the source pages.
  name = name.gsub("Machonochie", "Maconochie").gsub("Hume", " ").gsub("Market Place", "Marketplace").gsub("Terminus Fyshwick", "Terminus")
  # NOTE(review): the first substitution below replaces " - " with itself as
  # written; presumably one side was meant to be a different dash - confirm.
  name = name.gsub(" - ", " - ").gsub("\n", " ").gsub("\r", " ").gsub("\t", " ").gsub("\\", " / ").gsub("/", " / ").gsub(",", " ").gsub("\302\240", "").squeeze(" ").strip
  name.concat(")") if raw.include?("Platform")
  name
end

# Turns one body cell into a compact time token such as "705a" or "730p".
#
# cell must respond to #content. Returns nil for cells that carry no time:
# empty cells and notes containing "chool", "On Race Days" or "Bus".
def normalize_time_cell(cell)
  time = cell.content.squeeze(" ").strip
  # AM/PM in any spelling (AM, A.M, am, a.m ...) collapses to a/p.
  time = time.gsub(/ *A\S?M/, "a").gsub(/ ?P\S?M/, "p").gsub(/ *a\S?m/, "a").gsub(/ ?p\S?m/, "p")
  # NOTE(review): a former "12:08 AM" -> "1208x" substitution sat here, after
  # AM had already been rewritten, so it could never match and was dropped.
  # If the "x" marker is wanted it has to run before the AM/PM rewrite.
  time = time.gsub(":", "")
  # Remove the dot used as hour/minute separator ("7.30" -> "730"). Applied
  # one literal pattern at a time, in this order, because each pass sees the
  # output of the previous one.
  %w[1 2 3 4 5 6 7 8 9 10 11 12].each do |hour|
    time = time.gsub("#{hour}.", hour)
  end
  # Any dots left over are placeholder runs ("....") meaning "no stop here".
  time = time.gsub(/\.+/, "-").gsub("\302\240", "")
  return nil if time == "" || time.include?("chool") || time.include?("On Race Days") || time.include?("Bus")
  time
end

# Extracts one timetable from an HTML table and saves it as YAML.
#
# table      - table node; must respond to #xpath and #css, yielding cells
#              that respond to #content and #to_s (Nokogiri nodes in practice)
# period     - hash key the rows of times are stored under, also used in the
#              file name (the callers pass "stop_times", "stop_times_saturday"
#              or "stop_times_sunday")
# short_name - route number(s) as a String
#
# Returns the timetable Hash with keys "between_stops", "short_name",
# "time_points", "long_name" and +period+.
#
# Side effect: writes the hash as YAML to
# <directory of this file>/output/<short_name>-<long_name>.<period>.yml
# (lower-cased, spaces and slashes turned into dashes, parentheses removed),
# creating the output directory if it does not exist yet.
#
# Raises RuntimeError when the header row yields no timing points or when no
# row yields any times.
def makeTimetable(table, period, short_name)
  timetable = {"between_stops" => [], "short_name" => short_name}

  time_points = table.xpath('tr[1]//th').map { |th| normalize_timing_point(th) }.compact
  if time_points.empty?
    # Without this guard the "To " + nil below fails with an opaque TypeError.
    raise "No timing points for route #{short_name} in period #{period}"
  end
  timetable["time_points"] = time_points
  timetable["long_name"] = "To " + time_points.last

  periodtimes = []
  table.css('tr').each do |row|
    times = row.css('td').map { |td| normalize_time_cell(td) }.compact
    next if times.empty?
    # The first surviving cell of a row is the route number, not a time.
    # TODO: account for shifting route numbers eg. intertown/redex 62/162
    times.shift
    periodtimes << times
  end
  raise "No times for route #{short_name} in period #{period}" if periodtimes.empty?
  timetable[period] = periodtimes

  filename = "#{timetable["short_name"]}-#{timetable["long_name"]}.#{period}.yml"
  filename = filename.downcase.gsub(" ", "-").gsub("/", "-").gsub("(", "").gsub(")", "")
  puts "Saving #{filename}"

  output_dir = File.join(File.dirname(__FILE__), "output")
  Dir.mkdir(output_dir) unless File.directory?(output_dir)
  File.open(File.join(output_dir, filename), "w") do |f|
    f.write timetable.to_yaml
  end
  timetable
end
|
|
# Driver: for every saved route page matching source-html/*oute*.htm*, read the
# route number(s) from the <title>, extract each timetable table that follows a
# known day heading via makeTimetable (which also writes the YAML file), and
# print warnings when the set of extracted timetables looks unusual.
Dir.glob("source-html/*oute*.htm*") do |file|
  puts "Opened " + file
  # Block form so the file handle is closed as soon as the page is parsed.
  doc = File.open(file) { |f| Nokogiri::HTML(f) }

  short_name = ""
  doc.xpath('//title').each do |title|
    short_name = title.content.gsub("Route_", "").gsub("Route ", "").gsub("route ", "").gsub(", ", "/").gsub("ACTION Buses Timetable for ", "").squeeze(" ").strip
  end
  raise "Route number(s) not found in <title> tag" if short_name == ""

  timetables = []
  # Runs makeTimetable on every table matched by +xpath+, filing it under +period+.
  collect = lambda do |xpath, period|
    doc.xpath(xpath).each do |table|
      timetables << makeTimetable(table, period, short_name)
    end
  end

  # weekdays
  collect.call('//table[preceding::text()="Weekdays"]', "stop_times")
  # Pages that only carry the "effective from" banner: skipped for multi-digit
  # route names starting with "9".
  if short_name[0, 1] != "9" || short_name.size == 1
    collect.call('//table[preceding::text()="This timetable is effective from Monday 15th November 2010."]', "stop_times")
  end
  # all tables are weekdays on some really malformatted timetables
  collect.call('//table', "stop_times") if short_name == "170"

  # weekends
  collect.call('//table[preceding::text()="Saturdays" and following::a]', "stop_times_saturday")
  collect.call('//table[preceding::text()="Sundays"]', "stop_times_sunday")
  # 930/934 special cases
  collect.call('//table[preceding::text()="Saturday" and following::h2]', "stop_times_saturday")
  collect.call('//table[preceding::text()="Sunday"]', "stop_times_sunday")

  # route 81 = Weekdays - School Holidays Only
  # TODO set active date range to only be holidays
  collect.call('//table[preceding::text()="Weekdays - School Holidays Only "]', "stop_times")

  if timetables.size > 2
    puts "WARNING: " + file + " more than 2 timetables (weekend split?):" + timetables.size.to_s
  end
  if timetables.size < 2
    puts "WARNING: " + file + " less than 2 timetables (weekday loop service?):" + timetables.size.to_s
  else
    # The first two timetables are expected to be the two directions of the
    # same route, i.e. to share their timing points. Array#- ignores the order
    # of its argument, so no reversal is needed for this check.
    unmatched = timetables[0]["time_points"] - timetables[1]["time_points"]
    unless unmatched.empty?
      puts "WARNING: first pair of timetable timing points are not complementary for " + file
      pp(unmatched)
    end
  end
  raise "No timetables extracted from " + file if timetables.size < 1
end
|
|