Add html/pdf inconsistency fixes
[bus.git] / maxious-canberra-transit-feed / 01-extracttimes.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'pp'
require 'yaml'
# Core-class extension used when the timetables are dumped to YAML.
# NOTE(review): `to_yaml_style` is presumably the emitter hook that makes
# arrays serialise in inline (flow) form -- confirm that the YAML engine in
# use actually consults it.
class Array
  # Returns the Symbol :inline for every Array, regardless of contents.
  def to_yaml_style; :inline; end
end
 
 
# Cleans the text of one timetable header cell into a timing-point name.
#
# tp         - header cell node; must respond to #content and #to_s
# short_name - route number string taken from the page title
#
# Returns the cleaned name, or nil when the cell is blank or when its markup
# contains a three-digit number or the word "Wheelchair".
def timing_point_name(tp, short_name)
  raw = tp.content
  # "\302\240" is the UTF-8 byte sequence of a non-breaking space.
  return nil if raw == "\302\240" || raw == "" || raw == "<br/>"

  name = raw.squeeze(" ").gsub("Shops"," ").gsub("Bus Station"," Bus Station ").gsub("Interchange"," Bus Station ").gsub(" Platform"," (Platform")
  name = name.gsub("Machonochie","Maconochie").gsub("Hume"," ").gsub("Market Place","Marketplace").gsub("Terminus Fyshwick","Terminus")
  name = name.gsub("  - "," - ").gsub("\n"," ").gsub("\r"," ").gsub("\t"," ").gsub("\\"," / ").gsub("/"," / ").gsub(","," ").gsub("\302\240","").squeeze(" ").strip
  # On routes 923/924/938 the "Pearce" column is renamed.
  name = "Canberra Hospital" if %w[923 924 938].include?(short_name) && name == "Pearce"
  # Close the "(" that was inserted in front of "Platform" above.
  name += ")" if raw.match('Platform')

  markup = tp.to_s
  return nil if markup.match(/[0-9][0-9][0-9]/) || markup.include?("Wheelchair")
  name
end

# Cleans the text of one timetable body cell into a compact time token
# (e.g. "7:05 AM" becomes "705a", a run of dots becomes "-").
#
# cell - body cell node; must respond to #content
#
# Returns the token, or nil for blank cells and for cells whose text contains
# "chool", "On Race Days" or "Bus" (annotations rather than times).
def timetable_cell_time(cell)
  time = cell.content.squeeze(" ").strip
  time = time.gsub(/ *A\S?M/,"a").gsub(/ ?P\S?M/,"p").gsub(/ *a\S?m/,"a").gsub(/ ?p\S?m/,"p")
  # NOTE(review): the "12:08 AM" replacement below looks unreachable because
  # every "AM" has already been rewritten to "a" on the line above -- confirm
  # the intended special case before changing it.
  time = time.gsub("12:08 AM","1208x").gsub(":","").gsub("1.","1").gsub("2.","2").gsub("3.","3").gsub("4.","4")
  time = time.gsub("5.","5").gsub("6.","6").gsub("7.","7").gsub("8.","8").gsub("9.","9").gsub("10.","10")
  time = time.gsub("11.","11").gsub("12.","12").gsub(/\.+/,"-").gsub("\302\240","")
  return nil if time == "" || time.include?("chool") || time.include?("On Race Days") || time.include?("Bus")
  time
end

# Extracts one timetable from an HTML table, writes it to a YAML file under
# "output/" next to this script, and returns it.
#
# table      - table node; must respond to #xpath and #css (Nokogiri-style)
# period     - key the rows of times are stored under, e.g. "stop_times"
# short_name - route number string; also used to build the output file name
#
# Returns a Hash with the keys "between_stops", "short_name", "time_points",
# "long_name" and the given period.
# Raises RuntimeError when the table yields no timing points or no times.
def makeTimetable(table, period, short_name)
  timetable = {"between_stops" => [], "short_name" => short_name}

  time_points = table.xpath('tr[1]//th').map { |tp| timing_point_name(tp, short_name) }.compact
  # Without this guard the "To " + nil concatenation below fails with an
  # opaque TypeError.
  if time_points.empty?
    raise "No timing points for route " + short_name + " in period " + period
  end
  timetable["time_points"] = time_points
  timetable["long_name"] = "To " + time_points.last

  periodtimes = []
  table.css('tr').each do |row|
    times = row.css('td').map { |cell| timetable_cell_time(cell) }.compact
    next if times.empty?
    # The first remaining cell is dropped (presumably the route number).
    # TODO: account for shifting route numbers eg. intertown/redex 62/162
    times.shift
    periodtimes << times
  end
  if periodtimes.size < 1
    raise "No times for route " + short_name + " in period " + period
  end
  timetable[period] = periodtimes

  filename = timetable["short_name"] + "-" + timetable["long_name"] + "." + period + ".yml"
  filename = filename.downcase.gsub(" ","-").gsub("/","-").gsub("(","").gsub(")","")
  puts "Saving " + filename
  output_dir = "#{File.dirname(__FILE__)}/output"
  # Create the output directory on first use so a fresh checkout does not fail.
  Dir.mkdir(output_dir) unless File.directory?(output_dir)
  File.open("#{output_dir}/" + filename, "w") do |f|
    f.write timetable.to_yaml
  end
  timetable
end
 
# Driver: for every route page under source-html/, work out the route number
# from the <title>, extract each timetable table that follows a known heading
# via makeTimetable, and print warnings about unexpected table counts.
Dir.glob("source-html/*oute*.htm*") do |file|
  puts "Opened " + file
  # Block form closes the file handle as soon as the document is parsed.
  doc = File.open(file) { |html| Nokogiri::HTML(html) }
  timetables = []

  # The last <title> element wins if the page has more than one.
  short_name = ""
  doc.xpath('//title').each do |title|
    short_name = title.content.gsub("Route_","").gsub("Route ","").gsub("route ","").gsub(", ","/").gsub("ACTION Buses Timetable for ","").squeeze(" ").strip
  end
  raise "Route number(s) not found in <title> tag" if short_name == ""

  # Runs makeTimetable on every table matched by table_xpath and collects the
  # results in document order.
  extract = lambda do |table_xpath, period|
    doc.xpath(table_xpath).each do |table|
      timetables << makeTimetable(table, period, short_name)
    end
  end

  extract.call('//table[preceding::text()="Weekdays"]', "stop_times")
  # Skipped for multi-character route names starting with "9"
  # (presumably the weekend-only 9xx routes -- confirm).
  if short_name[0].chr != "9" || short_name.size == 1
    extract.call('//table[preceding::text()="This timetable is effective from Monday 15th November 2010."]', "stop_times")
  end
  # all tables are weekdays on some really malformatted timetables
  extract.call('//table', "stop_times") if short_name == "170"
  # weekends
  extract.call('//table[preceding::text()="Saturdays" and following::a]', "stop_times_saturday")
  extract.call('//table[preceding::text()="Sundays"]', "stop_times_sunday")
  # 930/934 special cases
  extract.call('//table[preceding::text()="Saturday" and following::h2]', "stop_times_saturday")
  extract.call('//table[preceding::text()="Sunday"]', "stop_times_sunday")
  # route 81 = Weekdays - School Holidays Only
  # TODO set active date range to only be holidays
  extract.call('//table[preceding::text()="Weekdays - School Holidays Only "]', "stop_times")

  if timetables.size > 2
    puts "WARNING: " + file + " more than 2 timetables (weekend split?):" + timetables.size.to_s
  end
  if timetables.size < 2
    puts "WARNING: " + file + " less than 2 timetables (weekday loop service?):" + timetables.size.to_s
  else
    # The first two timetables are expected to list the same timing points
    # in opposite order; report any that the second one lacks.
    unmatched = timetables[0]["time_points"] - timetables[1]["time_points"].reverse
    unless unmatched.empty?
      puts "WARNING: first pair of timetable timing points are not complementary for "+ file
      pp(unmatched)
    end
  end
  raise "No timetables extracted from " + file if timetables.size < 1
end