require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'pp'
require 'yaml'

# Asks the YAML emitter to write arrays inline.
# NOTE(review): presumably only honoured by the older (Syck) YAML engine;
# verify the output format under the Ruby version actually in use.
class Array
  def to_yaml_style
    :inline
  end
end

# Header cell contents that do not name a timing point
# ("\302\240" is the UTF-8 byte sequence of a non-breaking space).
BLANK_HEADER_CELLS = ["\302\240", "", "<br/>"].freeze

# Header labels that are dropped from the list of timing points.
IGNORED_TIMING_POINTS = ["WheelchairAccessible", "Wheelchair Accessible"].freeze

# Literal replacements applied, in this exact order, to a header cell.
TIMING_POINT_REPLACEMENTS = [
  ["Bus Station", " Bus Station "],
  [" Platform", " (Platform"],
  [" - ", " - "],
  ["\n", " "],
  ["\r", " "],
  ["\t", " "],
  ["\\", " / "],
  ["/", " / "],
  [",", ", "],
  ["\302\240", ""]
].freeze

# Hour prefixes whose trailing "." separator is removed ("7.15" -> "715").
# They are applied one after another, in this order, as literal replacements.
HOUR_PREFIXES = %w[1 2 3 4 5 6 7 8 9 10 11 12].freeze

# Normalises the text of one header cell into a timing point name.
# Returns nil when the cell is blank.
def clean_timing_point(raw)
  return nil if BLANK_HEADER_CELLS.include?(raw)

  name = raw.squeeze(" ")
  TIMING_POINT_REPLACEMENTS.each { |from, to| name = name.gsub(from, to) }
  name = name.squeeze(" ").strip
  # " Platform" was rewritten to " (Platform" above; close the bracket.
  name = name + ")" if raw.include?("Platform")
  name
end

# Normalises the text of one table cell into a compact time token
# (e.g. "7:15 AM" -> "715a"). Returns nil when nothing is left.
def normalize_time(raw)
  time = raw.squeeze(" ").strip
  time = time.gsub(/ *A\S?M/, "a").gsub(/ ?P\S?M/, "p")
  # NOTE(review): this replacement looks unreachable because the AM/PM
  # rewrite above has already consumed any "AM"; kept in place so the
  # output is unchanged until the intent is confirmed.
  time = time.gsub("12:08 AM", "1208x")
  time = time.gsub(":", "")
  HOUR_PREFIXES.each { |hour| time = time.gsub(hour + ".", hour) }
  # Any remaining run of dots becomes a single "-".
  time = time.gsub(/\.+/, "-").gsub("\302\240", "")
  time == "" ? nil : time
end

# Builds a timetable hash from one HTML table and saves it as YAML under
# the "output" directory next to this script.
#
# table      - node answering xpath/css (a Nokogiri table element here)
# period     - key under which the rows of times are stored, e.g. "stop_times"
# short_name - route number(s) used in the hash and in the file name
#
# Returns the timetable hash. Raises RuntimeError when the table yields no
# timing points or no rows of times.
def makeTimetable(table, period, short_name)
  timetable = { "between_stops" => [], "short_name" => short_name }

  time_points = table.xpath('tr[1]//th').map { |th| clean_timing_point(th.content) }
  time_points = time_points.compact.reject { |tp| IGNORED_TIMING_POINTS.include?(tp) }
  if time_points.empty?
    raise "No timing points for route " + short_name + " in period " + period
  end
  timetable["time_points"] = time_points
  timetable["long_name"] = "To " + time_points.last

  periodtimes = []
  table.css('tr').each do |row|
    times = row.css('td').map { |cell| normalize_time(cell.content) }.compact
    next if times.empty?
    # The first cell of each row is discarded (the original code named it "route").
    # TODO: account for shifting route numbers eg. intertown/redex 62/162
    times.shift
    periodtimes << times
  end
  if periodtimes.size < 1
    raise "No times for route " + short_name + " in period " + period
  end
  timetable[period] = periodtimes

  filename = timetable["short_name"] + "-" + timetable["long_name"] + "." + period + ".yml"
  filename = filename.downcase.gsub(" ", "-").gsub("/", "-").gsub("(", "").gsub(")", "")
  puts "Saving " + filename

  output_dir = File.join(File.dirname(__FILE__), "output")
  # Create the target directory on first use instead of failing with ENOENT.
  Dir.mkdir(output_dir) unless File.directory?(output_dir)
  File.open(File.join(output_dir, filename), "w") do |f|
    f.write timetable.to_yaml
  end

  timetable
end

# Runs makeTimetable on every table matched by xpath and returns the results.
def extract_timetables(doc, xpath, period, short_name)
  doc.xpath(xpath).map { |table| makeTimetable(table, period, short_name) }
end

# Sorted so files are processed (and logged) in a deterministic order.
Dir.glob("source-html/Route*.htm*").sort.each do |file|
  puts "Opened " + file
  # Block form closes the file handle once the document has been parsed.
  doc = File.open(file) { |io| Nokogiri::HTML(io) }

  short_name = ""
  doc.xpath('//title').each do |title|
    short_name = title.content.gsub("Route_", "").gsub("Route ", "").gsub(", ", "/").gsub("ACTION Buses Timetable for ", "").squeeze(" ").strip
  end
  if short_name == ""
    raise "Route number(s) not found in <title> tag"
  end

  timetables = []

  # weekdays
  timetables.concat(extract_timetables(doc, '//table[preceding::text()="Weekdays"]', "stop_times", short_name))
  timetables.concat(extract_timetables(doc, '//table[preceding::text()="This timetable is effective from Monday 15th November 2010."]', "stop_times", short_name))

  # all tables are weekdays on some really malformatted timetables
  if short_name == "170"
    timetables.concat(extract_timetables(doc, '//table', "stop_times", short_name))
  end

  # weekends
  timetables.concat(extract_timetables(doc, '//table[preceding::text()="Saturdays" and following::a]', "stop_times_saturday", short_name))
  timetables.concat(extract_timetables(doc, '//table[preceding::text()="Sundays"]', "stop_times_sunday", short_name))

  # 930/934 special cases
  timetables.concat(extract_timetables(doc, '//table[preceding::text()="Saturday" and following::h2]', "stop_times_saturday", short_name))
  timetables.concat(extract_timetables(doc, '//table[preceding::text()="Sunday"]', "stop_times_sunday", short_name))

  # route 81 = Weekdays - School Holidays Only
  # TODO set active date range to only be holidays
  timetables.concat(extract_timetables(doc, '//table[preceding::text()="Weekdays - School Holidays Only "]', "stop_times", short_name))

  if timetables.size > 2
    puts "WARNING: " + file + " more than 2 timetables (weekend split?):" + timetables.size.to_s
  end

  if timetables.size < 2
    puts "WARNING: " + file + " less than 2 timetables (weekday loop service?):" + timetables.size.to_s
  else
    # Timing points of the first timetable that are missing from the second.
    unmatched = timetables[0]["time_points"] - timetables[1]["time_points"].reverse
    unless unmatched.empty?
      puts "WARNING: first pair of timetable timing points are not complementary for " + file
      pp(unmatched)
    end
  end

  if timetables.size < 1
    raise "No timetables extracted from " + file
  end
end