diff -ruN xmltv-0.5.39/grab/nl/tv_grab_nl.in xmltv-0.5.39-fixed/grab/nl/tv_grab_nl.in --- xmltv-0.5.39/grab/nl/tv_grab_nl.in 2004-11-28 23:31:01.000000000 +0100 +++ xmltv-0.5.39-fixed/grab/nl/tv_grab_nl.in 2005-05-26 19:54:15.017352000 +0200 @@ -53,8 +53,9 @@ =head1 AUTHOR -Guido Diepen and Ed Avis (ed@membled.com). Originally based on -tv_grab_fi by Matti Airas. +First version by Guido Diepen and Ed Avis (ed@membled.com). +Second version by Eric Bus (eric@fambus.nl). +Originally based on tv_grab_fi by Matti Airas. =head1 BUGS @@ -115,8 +116,7 @@ } # Function prototypes. -sub time_tot_str( $ ); -sub time_van_str( $ ); +sub time_to_str( $ ); sub get_channels( $ ); sub process_summary_page( $$$$ ); sub parse_dutch_date( $ ); @@ -328,12 +328,12 @@ foreach $ch_did (@channels) { my $ch_xid = "$ch_did.tvgids.nl"; for (my $i = $opt_offset;$i<($opt_offset + $opt_days);$i++) { - my $url = 'http://www.tvgids.nl/zoekprogramma.php' - . "?station=$ch_did&interval=$i"; - my $day = UnixDate(DateCalc($now, "+ $i days"), '%Y-%m-%d'); - die if not defined $day; - die if ref $url; - push @to_get, [ $url, $day, $ch_xid, $ch_did ]; + my $url = 'http://www.tvgids.nl/zoeken/' + . "?station=$ch_did&genre=&interval=$i×lot=0"; + my $day = UnixDate(DateCalc($now, "+ $i days"), '%Y-%m-%d'); + die if not defined $day; + die if ref $url; + push @to_get, [ $url, $day, $ch_xid, $ch_did ]; } } @@ -342,6 +342,7 @@ my @summary_page_data; my $bar = new XMLTV::ProgressBar('downloading summary', scalar @to_get) if not $opt_quiet; + foreach (@to_get) { my ($url, $day, $ch_xmltv_id, $ch_tvgids_id) = @$_; die if ref $url; @@ -354,6 +355,7 @@ # Now we've fetched the descriptions, we know the channel names. foreach $ch_did (@channels) { + print "processing channel #$ch_did\n"; $ch_name = $channels{$ch_did}; die "did not see any name for $ch_did in any listing pages" if not defined $ch_name; @@ -364,6 +366,7 @@ my @summary_programmes; my %detail_url_to_summary_url; + foreach (@summary_page_data) { my ($summary_url, $ch_xmltv_id, $data) = @$_; @@ -373,142 +376,151 @@ # my $clump; # [ start, stop, programmes ]. stop may be undef. my @clumps; + foreach (@$data) { - my ($start, $stop, $title, $url) = @$_; - die if ref $url; - local $SIG{__WARN__} = sub { warn "$url: $_[0]" }; - for ($detail_url_to_summary_url{$url}) { - warn "more than one programme with same details page $url" - if defined; - $_ = $summary_url; - } + my ($start, $stop, $title, $url) = @$_; + die if ref $url; + local $SIG{__WARN__} = sub { warn "$url: $_[0]" }; + for ($detail_url_to_summary_url{$url}) { + warn "more than one programme with same details page $url" + if defined; + $_ = $summary_url; + } - # Start and stop are common to a whole clump, but these - # two are given individually for each programme. - # - my $details = [ $title, $url ]; + # Start and stop are common to a whole clump, but these + # two are given individually for each programme. + # + my $details = [ $title, $url ]; - if (not defined $start) { - if (not $clump) { - warn "programme '$title' at beginning of page has no start time, dropping\n"; - next; - } - t 'found programme with no start time'; + if (not defined $start) { + if (not $clump) { + warn "programme '$title' at beginning of page has no start time, dropping\n"; + next; + } + t 'found programme with no start time'; - if (defined $clump->[1]) { - t 'make it start at stop of last clump'; - die if not defined $clump->[0]; - push @clumps, $clump; - $clump = [ $clump->[1], $stop, [ $details ] ]; - die if not defined $clump->[0]; - } - else { - t 'current clump has no stop, add to clump'; - push @{$clump->[2]}, $details; - t 'maybe set stop of current clump'; - $clump->[1] = $stop; - } - } - else { - t 'programme has start time, make new clump'; - if ($clump) { - die if not defined $clump->[0]; - my $cmp = Date_Cmp($clump->[0], $start); - if ($cmp == 0) { - # Oddity in the web pages: this programme has the - # same start time as a previous one. (Until I - # found this case, the clumping was only for cases - # where a programme lacked a start time, I think.) - # Anyway handle this by making it join the - # existing clump. - # - t 'same start time as existing clump, join it'; - - if (defined $stop) { - # Compare stop time of the current clump with - # this programme and extend the clump if - # necessary. - # - my $cmp = Date_Cmp($clump->[1], $stop); - if ($cmp < 0) { - # Okay, later stop time, extend clump. - $clump->[1] = $stop; - } - elsif ($cmp == 0) { - # Okay. - } - elsif ($cmp > 0) { - warn "programme (from $start to $stop) has same start as one before it, but earlier stop, ignoring stop time\n"; - } - else { die } - } - - push @{$clump->[2]}, $details; - } - else { - # (Don't bother checking that $cmp < 0, it should - # be, but often programmes appear out of order...) - # - push @clumps, $clump; - $clump = [ $start, $stop, [ $details ] ]; - t 'started a new clump for new start time, now: ' . d $clump; - } - } - else { - t 'no existing clump, starting one for this programme'; - $clump = [ $start, $stop, [ $details ] ]; - } - die if not defined $clump->[0]; - } + if (defined $clump->[1]) { + t 'make it start at stop of last clump'; + die if not defined $clump->[0]; + push @clumps, $clump; + $clump = [ $clump->[1], $stop, [ $details ] ]; + die if not defined $clump->[0]; + } + else { + t 'current clump has no stop, add to clump'; + push @{$clump->[2]}, $details; + t 'maybe set stop of current clump'; + $clump->[1] = $stop; + } + } + else { + t 'programme has start time, make new clump'; + if ($clump) { + die if not defined $clump->[0]; + my $cmp = Date_Cmp($clump->[0], $start); + + if ($cmp == 0) { + # Oddity in the web pages: this programme has the + # same start time as a previous one. (Until I + # found this case, the clumping was only for cases + # where a programme lacked a start time, I think.) + # Anyway handle this by making it join the + # existing clump. + # + t 'same start time as existing clump, join it'; + + if (defined $stop) { + # Compare stop time of the current clump with + # this programme and extend the clump if + # necessary. + # + my $cmp = Date_Cmp($clump->[1], $stop); + if ($cmp < 0) { + # Okay, later stop time, extend clump. + $clump->[1] = $stop; + } + elsif ($cmp == 0) { + # Okay. + } + elsif ($cmp > 0) { + warn "programme (from $start to $stop) has same start as one before it, but earlier stop, ignoring stop time\n"; + } + else { die } + } + + push @{$clump->[2]}, $details; + } + else { + # (Don't bother checking that $cmp < 0, it should + # be, but often programmes appear out of order...) + # + push @clumps, $clump; + $clump = [ $start, $stop, [ $details ] ]; + t 'started a new clump for new start time, now: ' . d $clump; + } + } + else { + t 'no existing clump, starting one for this programme'; + $clump = [ $start, $stop, [ $details ] ]; + } + + die if not defined $clump->[0]; + } } + if ($clump) { - die if not defined $clump->[0]; - push @clumps, $clump; + die if not defined $clump->[0]; + push @clumps, $clump; } + t '\@clumps=' . d \@clumps; - # Now add the clumpidx attributes. - foreach (@clumps) { - my ($start, $stop, $l) = @$_; - die if not defined $start; - my $num_in_clump = @$l; - if ($num_in_clump == 1) { - # Common case, no clumpidx needed. - } - elsif ($num_in_clump > 1) { - foreach my $i (0 .. $num_in_clump - 1) { - # Add clumpidx as last thing in list. - push @{$l->[$i]}, "$i/$num_in_clump"; - } - } - else { die } + # Now add the clumpidx attributes. + foreach (@clumps) { + my ($start, $stop, $l) = @$_; + die if not defined $start; + my $num_in_clump = @$l; + if ($num_in_clump == 1) { + # Common case, no clumpidx needed. + } + elsif ($num_in_clump > 1) { + foreach my $i (0 .. $num_in_clump - 1) { + # Add clumpidx as last thing in list. + push @{$l->[$i]}, "$i/$num_in_clump"; + } + } + else { die } } + t 'after adding clumpidxes, \@clumps=' . d \@clumps; # Finally turn the data into programmes. foreach (@clumps) { - my ($start, $stop, $l) = @$_; - die if not defined $start; - foreach (@$l) { - my ($title, $url, $clumpidx) = @$_; - my %h = (channel => $ch_xmltv_id, - title => [ [ $title, $LANG ] ], - ); - for (date_to_local($start, $TZ)) { - $h{start} = UnixDate($_->[0], '%q') . " $_->[1]"; + my ($start, $stop, $l) = @$_; + die if not defined $start; + foreach (@$l) { + my ($title, $url, $clumpidx) = @$_; + my %h = (channel => $ch_xmltv_id, + title => [ [ $title, $LANG ] ], + ); + + for (date_to_local($start, $TZ)) { + $h{start} = UnixDate($_->[0], '%q') . " $_->[1]"; } + if (defined $stop) { - for (date_to_local($stop, $TZ)) { - $h{stop} = UnixDate($_->[0], '%q') . " $_->[1]"; - } + for (date_to_local($stop, $TZ)) { + $h{stop} = UnixDate($_->[0], '%q') . " $_->[1]"; + } } if (defined $url) { - die if ref $url; - $h{url} = [ $url ]; + die if ref $url; + $h{url} = [ $url ]; } - $h{clumpidx} = $clumpidx if defined $clumpidx; - push @summary_programmes, \%h; + + $h{clumpidx} = $clumpidx if defined $clumpidx; + push @summary_programmes, \%h; } } } @@ -573,33 +585,33 @@ my $new = $detailed->{$_}; if ($_ eq 'title') { - # We know how to merge this. TODO write general - # XMLTV::Merge. - # - my %already; - foreach my $a (@$new) { - my $d = Dumper($a); - $already{$d}++ && warn "duplicate $_: $d"; - } - foreach my $o (@$old) { - my $d = Dumper($o); - push @$new, $o unless $already{$d}; - } + # We know how to merge this. TODO write general + # XMLTV::Merge. + # + my %already; + foreach my $a (@$new) { + my $d = Dumper($a); + $already{$d}++ && warn "duplicate $_: $d"; + } + foreach my $o (@$old) { + my $d = Dumper($o); + push @$new, $o unless $already{$d}; + } } else { - # Compare the two data structures. For this to work - # correctly it requires Data::Dumper 2.12 or later, as - # shipped with perl 5.8.0. Older versions don't - # support $Sortkeys. But we don't have any version - # check here - in the worst case all that results from - # using an older Data::Dumper is a few spurious - # warning messages. - # - my $old_dump = Dumper($old); - my $new_dump = Dumper($new); - if ($old_dump ne $new_dump) { - warn "mismatch between summary page and details page $url for $_: $old_dump vs $new_dump\n"; - } + # Compare the two data structures. For this to work + # correctly it requires Data::Dumper 2.12 or later, as + # shipped with perl 5.8.0. Older versions don't + # support $Sortkeys. But we don't have any version + # check here - in the worst case all that results from + # using an older Data::Dumper is a few spurious + # warning messages. + # + my $old_dump = Dumper($old); + my $new_dump = Dumper($new); + if ($old_dump ne $new_dump) { + warn "mismatch between summary page and details page $url for $_: $old_dump vs $new_dump\n"; + } } } @@ -629,20 +641,22 @@ my $warned_bad_chars; sub tidy( $ ) { for (my $tmp = shift) { - tr/\221\222/''/; - if (tr/\012\015\040-\176\240-\377//dc) { - warn 'removing bad characters' unless $warned_bad_chars++; - } - return $_; + tr/\221\222/''/; + if (tr/\012\015\040-\176\240-\377//dc) { + warn 'removing bad characters' unless $warned_bad_chars++; + } + return $_; } } # Returns a programme hashref, or undef, or the magic 'END'. sub process_details_page( $$$ ) { foreach (@_) { die if ref } + my ($ch_xmltv_id, $url, $master_url) = @_; + local $SIG{__WARN__} = sub { - warn "$url (from $master_url): $_[0]"; + warn "$url (from $master_url): $_[0]"; }; # We make an HTML::TreeBuilder object, get the information @@ -650,298 +664,299 @@ # my $t = new HTML::TreeBuilder(); eval { - $t->parse(tidy(get_nice($url))); + $t->parse(tidy(get_nice($url))); }; + if ($@) { - warn "error getting/parsing $url: $@"; - return; + warn "error getting/parsing $url: $@"; + return; } - my @elems = $t->look_down(class => 'detailDeel'); - if (not @elems) { - warn "did not see any 'detailDeel' elements, skipping page"; - return; + + my @elems = $t->look_down('id' => 'progDetail'); + if (not @elems or @elems != 1) { + warn "did not see one single 'progDetail' element, skipping page"; + return; } + + my $elem = $elems[0]; + my @hs = $elem->look_down('_tag' => 'h3'); + if (not @hs or @hs != 2) { + warn "could not find a valid 'h3' title, skipping page"; + return; + } + + my $naam = $hs[0]->as_text(); + $naam =~ s/^\s+//; $naam =~ s/\s+$//; + + my @desc; # accumulate bits + my @ps = $elem->look_down('_tag' => 'p'); + if (not @ps) { + warn "could not find a valid description, skipping page"; + return; + } + + for( my $i=0; $i<=2; $i++ ) + { + my $text = $ps[$i]->as_text(); + $text =~ s/^\s+//; + $text =~ s/\s+$//; + push @desc, $text if $text ne ''; + } + + @elems = $t->look_down('id' => 'data'); + if (not @elems or @elems != 1) { + warn "did not see one single 'data' element, skipping page"; + return; + } + my @info; - foreach (@elems) { - my @cont = grep { ref } $_->content_list(); - my $n = scalar @cont; - if ($n != 2) { - warn "'detailDeel' has $n elements instead of 2"; - next; - } - my ($k, $v) = @cont; - for ($k->attr('class')) { - if (not defined or $_ ne 'detailLabel2') { - warn "didn't see 'detailLabel2' in 'detailDeel'"; - next; - } - } - for ($v->attr('class')) { - if (not defined or $_ ne 'detailContent2') { - warn "didn't see 'detailContent2' in 'detailDeel'"; - next; - } - } - push @info, [ $k->as_text(), $v->as_text() ]; + $elem = $elems[0]; + my @trs = $elem->look_down('_tag' => 'tr'); + if( not @trs ) { + warn "did not find records below the 'data' element, skipping page"; + return; + } + + foreach (@trs) { + my $tr = $_; + my @ths = $tr->look_down('_tag' => 'th'); + my @tds = $tr->look_down('_tag' => 'td'); + if( @ths == 1 && @tds == 1 ) { + push @info, [ $ths[0]->as_text(), $tds[0]->as_text() ]; + } } $t->delete(); undef $t; # Process the list of [ heading, data ] pairs. my ( - # Exactly one: - $van, $tot, $naam, + # Exactly one: + $van, $tot, - # At most one: + # At most one: $director, $previously_shown, $orig_title, $sub_title, $genre, $date, $episode_num, $actors, $writers, $commentators, - # Zero or more: + # Zero or more: @presenter, @url, - ); + ); + # NB 'at most one' $actors but that one entry can give several. my ($teletext_sub, $widescreen) = 0; # boolean - my @desc; # accumulate bits my $seen_tijdstip = 0; my $last; - ELEM: foreach (@info) { - my ($regel, $text) = @$_; - foreach ($regel, $text) { - s/^\s+//; s/\s+$//; - } - - if ($regel eq '') { - # Continuation of the previous one, hopefully. - $regel = $last; - } - else { - # They usually end with a colon but not always. - $regel =~ s/:$//; - $last = $regel; - } - if ($regel eq 'Tijdstip') { - warn "seen 'Tijdstip' twice\n" if $seen_tijdstip++; - if (length($text)<=16) { - t "'onvolledig' is true, nothing more to write"; - t 'process_details_page() RETURNING'; - return 'END'; - } - else { - # Extract time strings from the text, but not full - # Date::Manip objects. - # - $van = time_van_str($text); - $tot = time_tot_str($text); - } - } - elsif ($regel eq 'Inhoud') { - # Empty text for this happens often, just skip it. - push @desc, $text if $text ne ''; - } - elsif ($regel eq 'Programma') { - warn "seen 'Programma' twice\n" if defined $naam; - # FIXME should really look for 'herhaling' in italics. - if ($text =~ s/\bherhaling\s+van\s+(\d\d?)-(\d\d?)-(\d{4})//) { - warn "seen previously-shown information twice\n" - if $previously_shown; - my ($dd, $mm, $yyyy) = ($1, $2, $3); - $previously_shown = { start => "$yyyy$mm$dd" }; - $text =~ s/^\s+//; $text =~ s/\s+$//; - } - elsif ($text =~ s/herhaling\b//) { - # Repeat, but no previous date given. NB - # sometimes we see 'herhaling' without a space - # before it, as in the redundant - # - # 'Netwerk herhalingenherhaling' - # - # Hence no \b at the start of the regexp. We just - # have to hope there aren't too many compound - # words ending in 'herhaling'. - # - $previously_shown = {}; - $text =~ s/^\s+//; $text =~ s/\s+$//; - } - $naam = $text; - } - elsif ($regel eq 'Genre') { - warn "seen 'Genre' twice\n" if defined $genre; - # Empty text for this happens often, just skip it. - $genre = $text if $text ne ''; - } - elsif ($regel eq 'Zender') { - # I think this means 'broadcaster' but the information - # is redundant because we already know the channel. - # - # Then we should check it and warn if it differs! But - # it does differ - every programme on the channel - # Nederland 1, it seems, has Zender of 'Nederland - # 2'. So we just ignore this information. - # - } - elsif ($regel eq 'Omroep') { - # FIXME I don't know what this means (the dictionary - # says 'wireless telegraph' but that's no help) so - # just ignore it. - # - } - elsif ($regel eq 'Kenmerken') { - foreach (split /,\s*/, $text) { - if ($_ eq 'Teletekst ondertiteld') { - # I'm guessing this means teletext subtitles :-). - $teletext_sub++ - && warn 'seen teletext subtitles twice'; - } - elsif ($_ eq 'Breedbeeld uitzending') { - $widescreen++ && warn 'seen widescreen twice'; - } - elsif (length >= 50) { - # Some long sentence, part of description. - push @desc, $_; - } - else { - warn "unknown 'Kenmerken' bit $_" - unless $warned_regel{"Kenmerken: $_"}++; - push @desc, $_; - } - } - } - elsif ($regel eq 'Presentatie') { - push @presenter, $text; - } - elsif ($regel eq 'Afleverings nummer') { - warn "seen 'Afleverings nummer' twice" - if defined $episode_num; - if ($text eq 'Slot') { - # The last episode of a series. There isn't a way to - # store this in the current XMLTV format. - # - warn "discarding 'Slot'" unless $warned_slot++; - } - elsif ($text =~ /^\d+$/) { - if ($text == 0) { - warn "I thought episode nums on the site were from 1"; - } - else { - $episode_num = $text - 1; - } - } - elsif ($text =~ /^(?:\d+-)+\d$/) { - # This means multiple episodes. This ought to be - # handled by turning the programme into a clump. - # - warn "programme covers multiple episodes ($text), not handled"; - } - else { - warn "bad episode number $text"; - } - } - elsif ($regel eq 'Titel aflevering') { - warn "seen 'Titel aflevering' twice" - if defined $sub_title; - $sub_title = $text; - } - elsif ($regel eq 'Webpagina') { - # We have to turn the string given, which is normally - # just a hostname, into a URL. I don't see why they - # don't just link to it directly, this is a web site - # after all. - # - # Anyway, the URI library doesn't seem to have any way - # to take a string and turn it into a URL adding - # 'http:' if necessary, so we do this by hand. - # - if ($text !~ tr/://) { - $text = "http://$text"; - } - push @url, $text; - } - elsif ($regel eq 'Rolverdeling') { - warn "seen 'Rolverdeling' twice" if $actors; - - # 'e.a' appearing in the description means 'and others'; - # it's implicit in XMLTV that there might be other actors, - # so we quietly remove it. - # - $text =~ s/\s*e\.a\s*$//; - - while (length $text) { - if ($text =~ s/\s*([^:]+):\s*([^.]+)(?:$|\.)//) { - warn "discarding information about the parts played by each actor\n" - unless $warned_discarding_parts++; - push @$actors, $2; - } - elsif ($text =~ s/\s*([^,]+)(?:$|,)//) { - push @$actors, $1; - } - else { - warn "unknown remnant 'Rolverdeling' text '$text'"; - last; - } - } - } - elsif ($regel eq 'Scenario schrijver') { - warn "seen 'Scenario schrijver' twice" if $writers; - push @$writers, $text; - } - elsif ($regel eq 'E-mail') { - push @url, "mailto:$text"; - } - elsif ($regel eq 'Bron') { - # FIXME cannot do anything special with this. It - # means 'source' and perhaps by parsing the text we - # could find the names of writers or whatever. - # - push @desc, "$regel: $text"; - } - elsif ($regel eq 'Commentaar') { - push @$commentators, $text; - } - elsif ($regel eq 'Jaar van premiere') { - # Year of release, I think. - warn "seen 'Jaar van premiere' twice" - if defined $date; - $date = $text; - } - elsif ($regel eq 'Regisseur') { - warn "seen 'Regisseur' twice" if defined $director; - $director = $text; - } - elsif ($regel eq 'Orginele titel') { - warn "seen 'Orginele titel' twice" if defined $orig_title; - $orig_title = $text; - } - elsif ($regel eq 'Behaalde prijzen') { - # Awards won. It doesn't seem worth adding a separate - # field for this to the XMLTV format, just append to - # the description. - # - push @desc, "$regel: $text"; - } - elsif ($regel eq 'Website') { - push @url, ($text =~ /^[a-z]+:/) ? $text : "http://$text"; - } - else { - # Unknown key, but let's add it to the desc so we - # don't lose information. These newlines are just for - # the benefit of someone reading the XML by hand. - # - push @desc, "$regel: $text"; - warn "unknown programme info key $regel\n" - unless $warned_regel{$regel}++; - } + ELEM: foreach (@info) { + my ($regel, $text) = @$_; + foreach ($regel, $text) { + s/^\s+//; s/\s+$//; + } + + if ($regel eq '') { + # Continuation of the previous one, hopefully. + $regel = $last; + } + else { + # They usually end with a colon but not always. + $regel =~ s/:$//; + $last = $regel; + } + + if ($regel eq 'Datum en tijdstip') { + warn "seen 'Tijdstip' twice\n" if $seen_tijdstip++; + if (length($text)<=16) { + t "'onvolledig' is true, nothing more to write"; + t 'process_details_page() RETURNING'; + return 'END'; + } + else { + # Extract time strings from the text, but not full + # Date::Manip objects. + # + ($van,$tot) = time_to_str($text); + } + } + elsif ($regel eq 'Inhoud') { + # Empty text for this happens often, just skip it. + push @desc, $text if $text ne ''; + } + elsif ($regel eq 'Genre') { + warn "seen 'Genre' twice\n" if defined $genre; + # Empty text for this happens often, just skip it. + $genre = $text if $text ne ''; + } + elsif ($regel eq 'Zender') { + # In the new layout, this field contains the logo + # for the channel. Ignore it, because these logos + # are too small to use in an application. + # + } + elsif ($regel eq 'Omroep') { + # We ignore this setting, because the XMLTV format + # doesn't have room for this. In Holland, a few broadcasters + # share the same channel. + # + } + elsif ($regel eq 'Bijzonderheden') { + foreach (split /,\s*/, $text) { + if ($_ eq 'Teletekst ondertiteld') { + # I'm guessing this means teletext subtitles :-). + $teletext_sub++ + && warn 'seen teletext subtitles twice'; + } + elsif ($_ eq 'Breedbeeld uitzending') { + $widescreen++ && warn 'seen widescreen twice'; + } + elsif (length >= 50) { + # Some long sentence, part of description. + push @desc, $_; + } + else { + warn "unknown 'Bijzonderheden' bit $_" + unless $warned_regel{"Bijzonderheden: $_"}++; + push @desc, $_; + } + } + } + elsif ($regel eq 'Teletekst') { + # Teletekst contains the 'teletext' page for this programme + # This information isn't used at the moment + } + elsif ($regel eq 'Presentatie') { + push @presenter, $text; + } + elsif ($regel eq 'Aflevering') { + warn "seen 'Aflevering" + if defined $episode_num; + if ($text eq 'Slot') { + # The last episode of a series. There isn't a way to + # store this in the current XMLTV format. + # + warn "discarding 'Slot'" unless $warned_slot++; + } + elsif ($text =~ /^\d+$/) { + if ($text == 0) { + warn "I thought episode nums on the site were from 1"; + } + else { + $episode_num = $text - 1; + } + } + elsif ($text =~ /^(?:\d+-)+\d$/) { + # This means multiple episodes. This ought to be + # handled by turning the programme into a clump. + # + warn "programme covers multiple episodes ($text), not handled"; + } + else { + warn "bad episode number $text"; + } + } + elsif ($regel eq 'Titel aflevering') { + warn "seen 'Titel aflevering' twice" + if defined $sub_title; + $sub_title = $text; + } + elsif ($regel eq 'Url') { + # We have to turn the string given, which is normally + # just a hostname, into a URL. I don't see why they + # don't just link to it directly, this is a web site + # after all. + # + # Anyway, the URI library doesn't seem to have any way + # to take a string and turn it into a URL adding + # 'http:' if necessary, so we do this by hand. + # + if ($text !~ tr/://) { + $text = "http://$text"; + } + push @url, $text; + } + elsif ($regel eq 'Acteurs') { + warn "seen 'Rolverdeling' twice" if $actors; + + # 'e.a' appearing in the description means 'and others'; + # it's implicit in XMLTV that there might be other actors, + # so we quietly remove it. + # + $text =~ s/\s*e\.a\s*$//; + + while (length $text) { + if ($text =~ s/\s*([^:]+):\s*([^.]+)(?:$|\.)//) { + warn "discarding information about the parts played by each actor\n" + unless $warned_discarding_parts++; + push @$actors, $2; + } + elsif ($text =~ s/\s*([^,]+)(?:$|,)//) { + push @$actors, $1; + } + else { + warn "unknown remnant 'Rolverdeling' text '$text'"; + last; + } + } + } + elsif ($regel eq 'Scenario') { + warn "seen 'Scenario' twice" if $writers; + push @$writers, $text; + } + elsif ($regel eq 'Email') { + push @url, "mailto:$text"; + } + elsif ($regel eq 'Bron') { + # FIXME cannot do anything special with this. It + # means 'source' and perhaps by parsing the text we + # could find the names of writers or whatever. + # + # push @desc, "$regel: $text"; + } + elsif ($regel eq 'Commentaar') { + push @$commentators, $text; + } + elsif ($regel eq 'Jaar van premiere') { + # Year of release, I think. + warn "seen 'Jaar van premiere' twice" + if defined $date; + $date = $text; + } + elsif ($regel eq 'Regisseur') { + warn "seen 'Regisseur' twice" if defined $director; + $director = $text; + } + elsif ($regel eq 'Orginele titel') { + warn "seen 'Orginele titel' twice" if defined $orig_title; + $orig_title = $text; + } + elsif ($regel eq 'Behaalde prijzen') { + # Awards won. It doesn't seem worth adding a separate + # field for this to the XMLTV format, just append to + # the description. + # + # push @desc, "$regel: $text"; + } + elsif ($regel eq 'Website') { + push @url, ($text =~ /^[a-z]+:/) ? $text : "http://$text"; + } + else { + # Unknown key, report it back to the prompt + # + warn "unknown programme info key $regel\n" + unless $warned_regel{$regel}++; + } } if (not defined $naam) { - warn "did not see programme title, skipping programme\n"; - return; + warn "did not see programme title, skipping programme\n"; + return; } if (not defined $van) { - warn "did not see programme times, skipping programme\n"; - return; + warn "did not find programme starttime, skipping programme\n"; + return; + } + if (not defined $tot) { + warn "did not see programme endtime, skipping programme\n"; + return; } - die if not defined $tot; my @title = ([ $naam, $LANG]); push @title, [ $orig_title ] if defined $orig_title; # not Dutch! @@ -983,41 +998,32 @@ return \%prog; } -sub time_tot_str( $ ) { +sub time_to_str( $ ) { my $input = shift; - if (length($input) == 15) { - $input .= '05:00'; - } - my $datum = substr($input,0,length($input)-11); - my $tot = substr($input,-5); - - $datum =~ /(\d\d?)-(\d\d?)-(\d+)/ - or die "cannot find year in '$datum'"; - my ($dd, $mm, $yyyy) = ($1, $2, $3); - foreach ($dd, $mm) { $_ = "0$_" if length == 1 } - $tot =~ /(\d\d):(\d\d)/ - or die "cannot find time in '$tot'"; - my ($HH, $MM) = ($1, $2); - return "$yyyy-$mm-$dd $HH:$MM:00"; -} -sub time_van_str( $ ) { - my $input = shift; - if (length($input) == 15) { - $input .= '06:00'; + # Replace months + $input =~ s/\bjanuari\b/1/g; + $input =~ s/\bfebruari\b/2/g; + $input =~ s/\bmaart\b/3/g; + $input =~ s/\bapril\b/4/g; + $input =~ s/\bmei\b/5/g; + $input =~ s/\bjuni\b/6/g; + $input =~ s/\bjuli\b/7/g; + $input =~ s/\baugustus\b/8/g; + $input =~ s/\bseptember\b/9/g; + $input =~ s/\boktober\b/10/g; + $input =~ s/\bnovember\b/11/g; + $input =~ s/\bdecember\b/12/g; + + if( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), (\d\d:\d\d)-(\d\d:\d\d) uur/ ) { + return ( "$3-$2-$1 $4:00", "$3-$2-$1 $5:00" ); + } + elsif( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), -(\d\d):(\d\d) uur/ ) { + return ( "$4-$3-$2 $4:".($5-30).":00", "$3-$2-$1 $4:$5:00" ); + } + elsif( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), (\d\d):(\d\d)- uur/ ) { + return ( "$3-$2-$1 $4:$5:00", "$3-$2-$1 $4:".($5+30).":00" ); } - my $datum = substr($input,0,length($input)-11); - my $van = substr($input,-11); - $van = substr($van,0,5); - - $datum =~ /(\d\d?)-(\d\d?)-(\d{4})/ - or die "cannot find year in '$datum'"; - my ($dd, $mm, $yyyy) = ($1, $2, $3); - foreach ($dd, $mm) { $_ = "0$_" if length == 1 } - $van =~ /(\d\d):(\d\d)/ - or die "cannot find time in '$van'"; - my ($HH, $MM) = ($1, $2); - return "$yyyy-$mm-$dd $HH:$MM:00"; } @@ -1032,40 +1038,44 @@ my %channels; if ($sanity) { - # Download what we can. - my $url = 'http://www.tvgids.nl'; - #All stations are in the select box. - #The station ID is the option value - my $t = new HTML::TreeBuilder(); - $t->parse(get_nice($url)); - my @conts = map { [ $_->content_list() ] } - $t->look_down('_tag' => 'select', 'name' => 'station'); - foreach my $cont (@conts) { - my @children =@$cont; - if (scalar(@children) == 0) { - warn 'No stations are defined'; - next; - } - foreach my $station_line (@children) { - if ($station_line ne ' ') { - #This if statement is to prevent parsing the last - #empty element from the list. - - my $channel_id = $station_line->attr('value'); - - #I am only interested in the normal channels. - #tvgids.nl has some pages for the regional stations also - #All normal channels have id <0,100> - #That is at the moment... Could change in future... - if ($channel_id > 0 && $channel_id < 100) { - my $channel_name = $station_line->as_text(); - $channels{$channel_id} = $channel_name - } - } - - } - } - $t->delete(); undef $t; + # Download what we can. + my $url = 'http://www.tvgids.nl'; + #All stations are in the select box. + #The station ID is the option value + my $t = new HTML::TreeBuilder(); + $t->parse(get_nice($url)); + my @conts = map { [ $_->content_list() ] } + $t->look_down('_tag' => 'select', 'name' => 'station'); + + foreach my $cont (@conts) { + my @children =@$cont; + + if (scalar(@children) == 0) { + warn 'No stations are defined'; + next; + } + + foreach my $station_line (@children) { + if ($station_line ne ' ') { + # This if statement is to prevent parsing the last + # empty element from the list. + + my $channel_id = $station_line->attr('value'); + + # I am only interested in the normal channels. + # tvgids.nl has some pages for the regional stations also + # All normal channels have id <0,300> + # That is at the moment... Could change in future... + if ($channel_id && $channel_id > 0 && $channel_id < 300) { + print $channel_id."\n"; + my $channel_name = $station_line->as_text(); + $channels{$channel_id} = $channel_name + } + } + } + } + + $t->delete(); undef $t; } # Now read the config file and check against any data we already @@ -1138,151 +1148,144 @@ # or else drop it with a warning. # my %warned_bad_channel_name; + sub process_summary_page( $$$$ ) { my ($url, $day, $official_day, $ch_name_ref) = @_; die if not defined $url; die if ref $url; die if not defined $day; die if ref $day; die if not defined $official_day; die if ref $official_day; + local $SIG{__WARN__} = sub { - warn "$url: $_[0]"; + warn "$url: $_[0]"; }; + local $SIG{__DIE__} = sub { - die "$url: $_[0]"; + die "$url: $_[0]"; }; + my $t = new HTML::TreeBuilder; $t->parse(get_nice($url)); - my %interesting; ++ $interesting{$_} foreach - qw(lijst_zender lijst_tijd details_programma); - my @elems = $t->look_down(sub { die if not defined $_[0]; - my $c = $_[0]->attr('class'); - return 0 if not defined $c; - return $interesting{$c} }); + my @elems = $t->look_down('_tag' => 'table', 'class' => 'overzicht'); + if (not @elems) { - warn 'did not find any programmes in page'; - return (); + warn 'did not find any programmes in page'; + return (); } + my @bits; + foreach my $e (@elems) { - my $class = $e->attr('class'); - t "looking at elem of class $class"; - my $href; - check_content: - my @cont = $e->content_list(); - my $got_href = $e->attr('href'); - if (defined $got_href) { - t "got href: $got_href"; - warn "seen 'href's contained inside each other" - if defined $href; - $href = $got_href; - } - if (not @cont) { - warn "found $class elem without content, ignoring" - unless $class eq 'lijst_zender'; - next; - } - elsif (@cont == 1) { - for ($cont[0]) { - if (ref) { - # Unpack this extra layer of element. - $e = $_; - goto check_content; - } - s/^\s+//; s/\s+$//; - push @bits, [ $class, $href, $_ ]; - } - } - elsif (@cont > 1) { - warn "found $class elem with more than one elem inside, ignoring" - unless $class eq 'lijst_zender'; - next; - } + my @cont = $e->look_down('_tag' => 'tr'); + my @elems = $t->look_down('_tag' => 'table', 'class' => 'overzicht'); + + if (not @elems) { + warn 'did not find any programmes in page'; + return (); + } } - $t->delete(); undef $t; + + foreach my $e (@elems) { + my @h4elems = $e->look_down('_tag' => 'h4'); + if (not @h4elems) { + warn 'did not find any date in page'; + return (); + } + else { + push @bits, [ $h4elems[0]->as_text(), $_ ]; + } + + my @cont = $e->look_down('_tag' => 'tr'); + foreach my $tr (@cont) { + # Fetch the time + my @ths = $tr->look_down('_tag' => 'th'); + my @tds = $tr->look_down('_tag' => 'td'); + + if( @ths == 1 and @tds > 1 ) { + my $th = $ths[0]; my $td1 = $tds[0]; my $td2 = $tds[1]; + my @ahs = $td1->look_down('_tag' => 'a'); + if( @ahs == 1 ) { + push @bits, [ $th->as_text(), $td1->as_text(), $td2->as_text(), $ahs[0]->attr('href'), $_ ]; + } + } + } + + } + + $t->delete(); + undef $t; + if (not @bits) { - warn "did not see any content, skipping page"; - return (); + warn "did not see any content, skipping page"; + return (); } + t 'got bits: ' . d @bits; - if ($bits[0]->[0] eq 'lijst_zender') { - my $date_str = $bits[0]->[2]; - my $d = parse_dutch_date($date_str); - if (defined $d) { - my ($d_base, $d_tz) = @{date_to_local($d, $TZ)}; - die "date in page $d_base ($date_str) doesn't match expected $official_day" - if UnixDate($d_base, '%Q') ne UnixDate($official_day, '%Q'); - shift @bits; - } - # otherwise, leave it for later processing - } + + my $date_str = $bits[0]->[0]; + my $d = parse_dutch_date($date_str); + if (defined $d) { + my ($d_base, $d_tz) = @{date_to_local($d, $TZ)}; + die "date in page $d_base ($date_str) doesn't match expected $official_day" + if UnixDate($d_base, '%Q') ne UnixDate($official_day, '%Q'); + shift @bits; + } + my @todo; - while (@bits >= 3) { - my $ch_bit = shift @bits; - my $ch_class = $ch_bit->[0]; - if ($ch_class ne 'lijst_zender') { - warn "bit expected to be channel name has class $ch_class not lijst_zender, skipping"; - next; - } - my $ch = $ch_bit->[2]; - t 'shifted bit to get $ch=' . d $ch; - for ($$ch_name_ref) { - if (not defined) { $_ = $ch } - elsif ($_ ne $ch) { - # Most likely the channels file has the wrong name. - warn "expected channel name $_, got $ch\n" - unless $warned_bad_channel_name{$_}{$ch}++; - $_ = $ch; - } - } + + while (@bits > 0) { + my $bit = shift @bits; + + my $ch = $bit->[2]; + t 'shifted bit to get $ch=' . d $ch; + for ($$ch_name_ref) { + if (not defined) { $_ = $ch } + elsif ($_ ne $ch) { + # Most likely the channels file has the wrong name. + warn "expected channel name $_, got $ch\n" + unless $warned_bad_channel_name{$_}{$ch}++; + $_ = $ch; + } + } - my $times_bit = shift @bits; - my $times_class = $times_bit->[0]; - if ($times_class ne 'lijst_tijd') { - warn "bit expected to be times has class $times_class not lijst_tijd, skipping"; - next; - } - my $times = $times_bit->[2]; - t 'shifted bit to get $times=' . d $times; + my $times = $bit->[0]; + t 'shifted bit to get $times=' . d $times; + + my $title_href = $bit->[3]; + my $title = $bit->[1]; + t 'shifted bit to get $title_href=' . d $title_href . ', ' + . '$title=' . d $title; + + if ($title =~ /^Ieder heel uur .+, tenzij anders vermeld$/) { + # A certain programme on the hour. But it isn't worth + # adding this to the output, ignore it. + # + next; + } + + if ($title eq 'NB: Programmering onder voorbehoud') { + # Programming subject to change. There isn't a way to + # represent this in the current XMLTV format. + # + next; + } - my $title_bit = shift @bits; - my $title_class = $title_bit->[0]; - if ($title_class ne 'details_programma') { - warn "bit expected to be title has class $title_class not details_programma, skipping"; - next; - } - my $title_href = $title_bit->[1]; - my $title = $title_bit->[2]; - t 'shifted bit to get $title_href=' . d $title_href . ', ' - . '$title=' . d $title; - - if ($title =~ /^Ieder heel uur .+, tenzij anders vermeld$/) { - # A certain programme on the hour. But it isn't worth - # adding this to the output, ignore it. - # - next; - } - if ($title eq 'NB: Programmering onder voorbehoud') { - # Programming subject to change. There isn't a way to - # represent this in the current XMLTV format. - # - next; - } - - my ($start_hhmm, $stop_hhmm); - if ($times =~ /^(\d\d):(\d\d)-/ - and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) { - $start_hhmm = "$1:$2"; - } - if ($times =~ /-(\d\d):(\d\d)$/ - and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) { - $stop_hhmm = "$1:$2"; - } - - # Right, got channel name, times, and title. - # FIXME should check channel name (among other things). - # - my $title_url = URI->new_abs($title_href, $url); - # We prefer to handle URLs as strings. - push @todo, [ $start_hhmm, $stop_hhmm, $title, "$title_url" ]; + my ($start_hhmm, $stop_hhmm); + if ($times =~ /^(\d\d):(\d\d) - / + and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) { + $start_hhmm = "$1:$2"; + } + if ($times =~ / - (\d\d):(\d\d)$/ + and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) { + $stop_hhmm = "$1:$2"; + } + + # Right, got channel name, times, and title. + # FIXME should check channel name (among other things). + # + my $title_url = URI->new_abs($title_href, $url); + + # We prefer to handle URLs as strings. + push @todo, [ $start_hhmm, $stop_hhmm, $title, "$title_url" ]; } # Now we need to make some sense of the times. When stop time @@ -1296,91 +1299,98 @@ my $next_day = UnixDate(DateCalc($day, '+ 1 day'), '%Y-%m-%d'); my $crossing_at; # local $Log::TraceMessages::On = 1; + t 'looking at raw times and searching for midnight crossing: ' . d \@todo; + for (my $i = 0; $i < @todo; ++$i) { - my ($start_hhmm, $stop_hhmm) = @{$todo[$i]}; - t '$start_hhmm=' . d $start_hhmm; - t '$stop_hhmm=' . d $stop_hhmm; - next if not defined $start_hhmm; - next if not defined $stop_hhmm; - - my $start = parse_local_date("$day $start_hhmm", $TZ); - my $stop = parse_local_date("$day $stop_hhmm", $TZ); - t "checking if $start -> $stop goes backwards"; - next if Date_Cmp($start, $stop) <= 0; - t "yup, it's a candidate"; - - my $stop_next_day = parse_local_date("$next_day $stop_hhmm", $TZ); - die if Date_Cmp($stop_next_day, $start) <= 0; - t 'if it were, stop time on next day would be: ' . d $stop_next_day; - - my $distance = Delta_Format(DateCalc($start, $stop_next_day), 0, '%st'); - t '...and length of programme: ' . d $distance; - t 'shortest length so far: ' . d $shortest; - if (not defined $shortest or $distance < $shortest) { - t 'this is the best so far'; - $shortest = $distance; - $crossing_at = $i; - } + my ($start_hhmm, $stop_hhmm) = @{$todo[$i]}; + t '$start_hhmm=' . d $start_hhmm; + t '$stop_hhmm=' . d $stop_hhmm; + next if not defined $start_hhmm; + next if not defined $stop_hhmm; + + my $start = parse_local_date("$day $start_hhmm", $TZ); + my $stop = parse_local_date("$day $stop_hhmm", $TZ); + t "checking if $start -> $stop goes backwards"; + next if Date_Cmp($start, $stop) <= 0; + t "yup, it's a candidate"; + + my $stop_next_day = parse_local_date("$next_day $stop_hhmm", $TZ); + die if Date_Cmp($stop_next_day, $start) <= 0; + t 'if it were, stop time on next day would be: ' . d $stop_next_day; + + my $distance = Delta_Format(DateCalc($start, $stop_next_day), 0, '%st'); + t '...and length of programme: ' . d $distance; + t 'shortest length so far: ' . d $shortest; + + if (not defined $shortest or $distance < $shortest) { + t 'this is the best so far'; + $shortest = $distance; + $crossing_at = $i; + } } + t '@todo=' . d \@todo; # Now given the place at which we cross from $day to $next_day we # can add the appropriate days to the hh:mm times. # if (not defined $crossing_at) { - push @$_, $day, $day foreach @todo; + push @$_, $day, $day foreach @todo; } else { - for (my $i = 0; $i < $crossing_at; ++$i) { - push @{$todo[$i]}, $day, $day; - } - for (my $i = $crossing_at) { - push @{$todo[$i]}, $day, $next_day; - } - for (my $i = $crossing_at + 1; $i < @todo; ++$i) { - push @{$todo[$i]}, $next_day, $next_day; - } + for (my $i = 0; $i < $crossing_at; ++$i) { + push @{$todo[$i]}, $day, $day; + } + for (my $i = $crossing_at) { + push @{$todo[$i]}, $day, $next_day; + } + for (my $i = $crossing_at + 1; $i < @todo; ++$i) { + push @{$todo[$i]}, $next_day, $next_day; + } } # Now we can parse the dates into Date::Manip objects. my @r; foreach (@todo) { - my ($start_hhmm, $stop_hhmm, $title, $title_url, $start_day, $stop_day) = @$_; - my ($start, $stop); - if (defined $start_hhmm) { - $start = parse_local_date("$start_day $start_hhmm", $TZ); - } - if (defined $stop_hhmm) { - $stop = parse_local_date("$stop_day $stop_hhmm", $TZ); - } - push @r, [ $start, $stop, $title, $title_url ]; + my ($start_hhmm, $stop_hhmm, $title, $title_url, $start_day, $stop_day) = @$_; + my ($start, $stop); + if (defined $start_hhmm) { + $start = parse_local_date("$start_day $start_hhmm", $TZ); + } + if (defined $stop_hhmm) { + $stop = parse_local_date("$stop_day $stop_hhmm", $TZ); + } + push @r, [ $start, $stop, $title, $title_url ]; } + t 'after parsing dates: ' . d \@r; # Check the dates and weed out those which are obviously wrong. my $last_start; foreach (@r) { - our ($start, $stop); - local (*start, *stop) = \ ($_->[0], $_->[1]); - t 'checking dates, $last_start=' . d $last_start; - t '$start=' . d $start; - t '$stop=' . d $stop; - if (defined $start and defined $stop - and Date_Cmp($start, $stop) > 0) { - # Appears to stop before it starts. Assume the stop time - # is bogus but the start time might be okay. - # - undef $stop; - } - if (defined $last_start) { - if (defined $start and Date_Cmp($start, $last_start) < 0) { - # Appears to start before previous start. - undef $start; - } + our ($start, $stop); + local (*start, *stop) = \ ($_->[0], $_->[1]); + t 'checking dates, $last_start=' . d $last_start; + t '$start=' . d $start; + t '$stop=' . d $stop; + + if (defined $start and defined $stop + and Date_Cmp($start, $stop) > 0) { + # Appears to stop before it starts. Assume the stop time + # is bogus but the start time might be okay. + # + undef $stop; + } + + if (defined $last_start) { + if (defined $start and Date_Cmp($start, $last_start) < 0) { + # Appears to start before previous start. + undef $start; + } if (defined $stop and Date_Cmp($stop, $last_start) < 0) { - # Stops before previous start - that's just as bad. - undef $stop; + # Stops before previous start - that's just as bad. + undef $stop; } } $last_start = $start if defined $start; @@ -1389,19 +1399,19 @@ # If there is a 'next page' link do a recursive call to handle it. foreach (@bits) { - my ($type, $href, $text) = @$_; - if ($type eq 'lijst_zender' and $text eq 'Volgende') { - # Next page for this day. - my $next_url = URI->new_abs($href, $url); - # Turn URL back into a string. - push @r, process_summary_page("$next_url", $day, $official_day, $ch_name_ref) - } - elsif ($type eq 'lijst_zender' and $text eq 'Vorige') { - # Previous page for this day, assume already fetched. - } - else { - warn "discarding leftover $type: $text"; - } + my ($type, $href, $text) = @$_; + if ($type eq 'lijst_zender' and $text eq 'Volgende') { + # Next page for this day. + my $next_url = URI->new_abs($href, $url); + # Turn URL back into a string. + push @r, process_summary_page("$next_url", $day, $official_day, $ch_name_ref) + } + elsif ($type eq 'lijst_zender' and $text eq 'Vorige') { + # Previous page for this day, assume already fetched. + } + else { + warn "discarding leftover $type: $text"; + } } t 'returning tuples: ' . d \@r; @@ -1448,13 +1458,13 @@ s/\bdecember\b/December/g; s/\bdec\b/December/g; - s/\bzondag\b/Sunday/g; - s/\bmaandag\b/Monday/g; - s/\bdinsdag\b/Tuesday/g; - s/\bwoensdag\b/Wednesday/g; - s/\bdonderdag\b/Thursday/g; - s/\bvrijdag\b/Friday/g; - s/\bzaterdag\b/Saturday/g; + s/\bZondag\b/Sunday/g; + s/\bMaandag\b/Monday/g; + s/\bDinsdag\b/Tuesday/g; + s/\bWoensdag\b/Wednesday/g; + s/\bDonderdag\b/Thursday/g; + s/\bVrijdag\b/Friday/g; + s/\bZaterdag\b/Saturday/g; my $r; eval { $r = parse_local_date($_, $TZ) };