Network 10 changes
[bus.git] / maxious-canberra-transit-feed / 01-extracttimes.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'pp'
require 'yaml'
# Core-class patch: every Array reports :inline as its YAML style.
# NOTE(review): `to_yaml_style` is presumably the hook consulted by the YAML
# emitter (Syck on Ruby 1.8) so that arrays are dumped as [a, b, c] rather than
# one item per line -- confirm it still has an effect on the YAML engine in use.
class Array
  # @return [Symbol] always :inline
  def to_yaml_style; :inline; end
end
 
 
# Normalises the raw text of one timetable header cell into a timing-point name.
#
# Collapses whitespace, pads "Bus Station", wraps a trailing "Platform ..." in
# parentheses, spaces out slashes/backslashes and commas, and removes
# non-breaking spaces ("\302\240" is the UTF-8 byte pair for U+00A0).
#
# @param raw [String] header cell text
# @return [String] cleaned timing-point name
def clean_timing_point(raw)
  name = raw.squeeze(" ")
  name = name.gsub("Bus Station", " Bus Station ").gsub(" Platform", " (Platform")
  name = name.gsub("  - ", " - ")
  name = name.gsub("\n", " ").gsub("\r", " ").gsub("\t", " ")
  # Backslashes become slashes first, then every slash gets surrounding spaces.
  name = name.gsub("\\", " / ").gsub("/", " / ")
  name = name.gsub(",", ", ").gsub("\302\240", "")
  name = name.squeeze(" ").strip
  # Close the parenthesis opened by the " (Platform" substitution above.
  name += ")" if raw.include?("Platform")
  name
end

# Converts the raw text of one timetable body cell to the compact time form
# used in the output, e.g. "6:05 AM" => "605a", "6.30 PM" => "630p",
# "....." => "-".
#
# @param raw [String] body cell text
# @return [String, nil] compact value, or nil when nothing is left after cleaning
def normalize_time_cell(raw)
  time = raw.squeeze(" ").strip
  time = time.gsub(/ *A\S?M/, "a").gsub(/ ?P\S?M/, "p").gsub(":", "")
  # Drop the "." used as an hour/minute separator after the hours 1..12.
  # The replacements are literal and applied in ascending order on purpose:
  # that is the exact sequence the timetable data has always been run through.
  (1..12).each { |hour| time = time.gsub("#{hour}.", hour.to_s) }
  # Any run of dots that is left is a filler cell; non-breaking spaces are noise.
  time = time.gsub(/\.+/, "-").gsub("\302\240", "")
  time.empty? ? nil : time
end

# Extracts one timetable from an HTML <table> node and saves it as YAML.
#
# Header cells (tr[1]//th) become the timing points; each later row that has
# at least one non-blank <td> becomes one list of times. The result is written
# to "output/<short_name>-<long_name>.<period>.yml" next to this script (the
# "output" directory must already exist) and also returned.
#
# @param table [#xpath, #css] table node (callers pass Nokogiri nodes)
# @param period [String] hash key / file-name part, e.g. "stop_times"
# @param short_name [String] route number(s) taken from the page title
# @return [Hash] keys "between_stops", "short_name", "time_points",
#   "long_name" and +period+
# @raise [RuntimeError] if the table yields no timing points or no times
def makeTimetable(table, period, short_name)
  headers = table.xpath('tr[1]//th').reject do |th|
    ["\302\240", "", "<br/>"].include?(th.content)
  end
  time_points = headers.map { |th| clean_timing_point(th.content) }
  time_points = time_points.reject do |name|
    ["WheelchairAccessible", "Wheelchair Accessible"].include?(name)
  end
  # Without this guard `"To " + nil` below fails with an opaque TypeError.
  if time_points.empty?
    raise "No timing points for route " + short_name + " in period " + period
  end

  timetable = {
    "between_stops" => [],
    "short_name" => short_name,
    "time_points" => time_points,
    "long_name" => "To " + time_points.last
  }

  periodtimes = []
  table.css('tr').each do |row|
    cells = row.css('td').map { |cell| normalize_time_cell(cell.content) }.compact
    next if cells.empty?
    # The first non-blank cell is discarded (the original code called it
    # `route`; presumably the route-number column).
    # NOTE(review): blank cells are removed before this shift, so a row whose
    # first cell is blank loses its first time instead -- confirm against data.
    # TODO: account for shifting route numbers eg. intertown/redex 62/162
    cells.shift
    periodtimes << cells
  end
  if periodtimes.empty?
    raise "No times for route " + short_name + " in period " + period
  end
  timetable[period] = periodtimes

  filename = timetable["short_name"] + "-" + timetable["long_name"] + "." + period + ".yml"
  # File-system friendly: lower case, spaces and slashes to "-", no parentheses.
  filename = filename.downcase.tr(" /", "-").delete("()")
  puts "Saving " + filename
  File.open(File.join(File.dirname(__FILE__), "output", filename), "w") do |f|
    f.write timetable.to_yaml
  end
  timetable
end
 
# Driver: for every saved route page under source-html/, work out the route
# number(s) from the <title>, extract each recognised timetable table with
# makeTimetable (which also writes it to disk), and print sanity warnings.
# Raises if a page has no route number or yields no timetable at all.
Dir.glob("source-html/Route*.htm*") do |file|
  puts "Opened " + file
  # Block-form File.open closes the handle as soon as parsing is done; the
  # previous bare open(file) left one descriptor open per input file.
  doc = File.open(file) { |html| Nokogiri::HTML(html) }

  # Route number(s) come from the page title; if there are several <title>
  # nodes the last one wins.
  short_name = ""
  doc.xpath('//title').each do |title|
    short_name = title.content.gsub("Route_","").gsub("Route ","").gsub(", ","/").gsub("ACTION Buses Timetable for ","").squeeze(" ").strip
  end
  if short_name.empty?
    raise "Route number(s) not found in <title> tag"
  end

  # [xpath query, period key] pairs, evaluated strictly in this order because
  # the checks below look at the first two timetables collected.
  sources = [
    ['//table[preceding::text()="Weekdays"]', "stop_times"],
    ['//table[preceding::text()="This timetable is effective from Monday 15th November 2010."]', "stop_times"]
  ]
  #all tables are weekdays on some really malformatted timetables
  sources << ['//table', "stop_times"] if short_name == "170"
  sources.concat([
    #weekends
    ['//table[preceding::text()="Saturdays" and following::a]', "stop_times_saturday"],
    ['//table[preceding::text()="Sundays"]', "stop_times_sunday"],
    #930/934 special cases
    ['//table[preceding::text()="Saturday" and following::h2]', "stop_times_saturday"],
    ['//table[preceding::text()="Sunday"]', "stop_times_sunday"],
    #route 81 = Weekdays - School Holidays Only
    #TODO set active date range to only be holidays
    ['//table[preceding::text()="Weekdays - School Holidays Only "]', "stop_times"]
  ])

  timetables = []
  sources.each do |query, period|
    doc.xpath(query).each do |table|
      timetables << makeTimetable(table, period, short_name)
    end
  end

  if timetables.size > 2
    puts "WARNING: " + file + " more than 2 timetables (weekend split?):" + timetables.size.to_s
  end
  if timetables.size < 2
    puts "WARNING: " + file + " less than 2 timetables (weekday loop service?):" + timetables.size.to_s
  else
    # Array#- is a set difference, so this reports timing points of the first
    # timetable that are absent from the second; the .reverse does not make
    # the comparison order-sensitive.
    unmatched = timetables[0]["time_points"] - timetables[1]["time_points"].reverse
    unless unmatched.empty?
      puts "WARNING: first pair of timetable timing points are not complementary for "+ file
      pp(unmatched)
    end
  end
  if timetables.size < 1
    raise "No timetables extracted from " + file
  end
end