Attachment 59889 Details for Bug 94101 – Fix for redesign of tvgids.nl

[patch] Fix for redesign of tvgids.nl

xmltv-0.5.39-tv_grab_nl-redesign.patch (text/plain), 49.91 KB, created by Maurice van der Pot (RETIRED) on 2005-05-26 11:12:38 UTC

(hide)

Description:

Filename:

MIME Type:

Creator: Maurice van der Pot (RETIRED)

Created: 2005-05-26 11:12:38 UTC

Size: 49.91 KB

patch

obsolete

>diff -ruN xmltv-0.5.39/grab/nl/tv_grab_nl.in xmltv-0.5.39-fixed/grab/nl/tv_grab_nl.in
>--- xmltv-0.5.39/grab/nl/tv_grab_nl.in	2004-11-28 23:31:01.000000000 +0100
>+++ xmltv-0.5.39-fixed/grab/nl/tv_grab_nl.in	2005-05-26 19:54:15.017352000 +0200
>@@ -53,8 +53,9 @@
> 
> =head1 AUTHOR
> 
>-Guido Diepen and Ed Avis (ed@membled.com).  Originally based on
>-tv_grab_fi by Matti Airas.
>+First version by Guido Diepen and Ed Avis (ed@membled.com).
>+Second version by Eric Bus (eric@fambus.nl).
>+Originally based on tv_grab_fi by Matti Airas.
> 
> =head1 BUGS
> 
>@@ -115,8 +116,7 @@
> }
> 
> # Function prototypes.
>-sub time_tot_str( $ );
>-sub time_van_str( $ );
>+sub time_to_str( $ );
> sub get_channels( $ );
> sub process_summary_page( $$$$ );
> sub parse_dutch_date( $ );
>@@ -328,12 +328,12 @@
> foreach $ch_did (@channels) {
>     my $ch_xid = "$ch_did.tvgids.nl";
>     for (my $i = $opt_offset;$i<($opt_offset + $opt_days);$i++) {
>-	my $url = 'http://www.tvgids.nl/zoekprogramma.php'
>-	  . "?station=$ch_did&interval=$i";
>-	my $day = UnixDate(DateCalc($now, "+ $i days"), '%Y-%m-%d');
>-	die if not defined $day;
>-	die if ref $url;
>-	push @to_get, [ $url, $day, $ch_xid, $ch_did ];
>+    	my $url = 'http://www.tvgids.nl/zoeken/'
>+    	  . "?station=$ch_did&genre=&interval=$i&timeslot=0";
>+     	my $day = UnixDate(DateCalc($now, "+ $i days"), '%Y-%m-%d');
>+    	die if not defined $day;
>+    	die if ref $url;
>+    	push @to_get, [ $url, $day, $ch_xid, $ch_did ];
>     }
> }
> 
>@@ -342,6 +342,7 @@
> my @summary_page_data;
> my $bar = new XMLTV::ProgressBar('downloading summary', scalar @to_get)
>   if not $opt_quiet;
>+
> foreach (@to_get) {
>     my ($url, $day, $ch_xmltv_id, $ch_tvgids_id) = @$_;
>     die if ref $url;
>@@ -354,6 +355,7 @@
> 
> # Now we've fetched the descriptions, we know the channel names.
> foreach $ch_did (@channels) {
>+    print "processing channel #$ch_did\n";
>     $ch_name = $channels{$ch_did};
>     die "did not see any name for $ch_did in any listing pages"
>       if not defined $ch_name;
>@@ -364,6 +366,7 @@
> 
> my @summary_programmes;
> my %detail_url_to_summary_url;
>+
> foreach (@summary_page_data) {
>     my ($summary_url, $ch_xmltv_id, $data) = @$_;
> 
>@@ -373,142 +376,151 @@
>     #
>     my $clump;			# [ start, stop, programmes ].  stop may be undef.
>     my @clumps;
>+
>     foreach (@$data) {
>-	my ($start, $stop, $title, $url) = @$_;
>-	die if ref $url;
>-	local $SIG{__WARN__} = sub { warn "$url: $_[0]" };
>-	for ($detail_url_to_summary_url{$url}) {
>-	    warn "more than one programme with same details page $url"
>-	      if defined;
>-	    $_ = $summary_url;
>-	}
>+    	my ($start, $stop, $title, $url) = @$_;
>+    	die if ref $url;
>+    	local $SIG{__WARN__} = sub { warn "$url: $_[0]" };
>+    	for ($detail_url_to_summary_url{$url}) {
>+    	    warn "more than one programme with same details page $url"
>+    	      if defined;
>+    	    $_ = $summary_url;
>+    	}
> 	
>-	# Start and stop are common to a whole clump, but these
>-	# two are given individually for each programme.
>-	#
>-	my $details = [ $title, $url ];
>+    	# Start and stop are common to a whole clump, but these
>+    	# two are given individually for each programme.
>+    	#
>+    	my $details = [ $title, $url ];
> 	
>-	if (not defined $start) {
>-	    if (not $clump) {
>-		warn "programme '$title' at beginning of page has no start time, dropping\n";
>-		next;
>-	    }
>-	    t 'found programme with no start time';
>+    	if (not defined $start) {
>+    	    if (not $clump) {
>+        		warn "programme '$title' at beginning of page has no start time, dropping\n";
>+    	    	next;
>+    	    }
>+    	    t 'found programme with no start time';
> 	
>-	    if (defined $clump->[1]) {
>-		t 'make it start at stop of last clump';
>-		die if not defined $clump->[0];
>-		push @clumps, $clump;
>-		$clump = [ $clump->[1], $stop, [ $details ] ];
>-		die if not defined $clump->[0];
>-	    }
>-	    else {
>-		t 'current clump has no stop, add to clump';
>-		push @{$clump->[2]}, $details;
>-		t 'maybe set stop of current clump';
>-		$clump->[1] = $stop;
>-	    }
>-	}
>-	else {
>-	    t 'programme has start time, make new clump';
>-	    if ($clump) {
>-		die if not defined $clump->[0];
>-		my $cmp = Date_Cmp($clump->[0], $start);
>-		if ($cmp == 0) {
>-		    # Oddity in the web pages: this programme has the
>-		    # same start time as a previous one.  (Until I
>-		    # found this case, the clumping was only for cases
>-		    # where a programme lacked a start time, I think.)
>-		    # Anyway handle this by making it join the
>-		    # existing clump.
>-		    #
>-		    t 'same start time as existing clump, join it';
>-
>-		    if (defined $stop) {
>-			# Compare stop time of the current clump with
>-			# this programme and extend the clump if
>-			# necessary.
>-			#
>-			my $cmp = Date_Cmp($clump->[1], $stop);
>-			if ($cmp < 0) {
>-			    # Okay, later stop time, extend clump.
>-			    $clump->[1] = $stop;
>-			}
>-			elsif ($cmp == 0) {
>-			    # Okay.
>-			}
>-			elsif ($cmp > 0) {
>-			    warn "programme (from $start to $stop) has same start as one before it, but earlier stop, ignoring stop time\n";
>-			}
>-			else { die }
>-		    }
>-
>-		    push @{$clump->[2]}, $details;
>-		}
>-		else {
>-		    # (Don't bother checking that $cmp < 0, it should
>-		    # be, but often programmes appear out of order...)
>-		    #
>-		    push @clumps, $clump;
>-		    $clump = [ $start, $stop, [ $details ] ];
>-		    t 'started a new clump for new start time, now: ' . d $clump;
>-		}
>-	    }
>-	    else {
>-		t 'no existing clump, starting one for this programme';
>-		$clump = [ $start, $stop, [ $details ] ];
>-	    }
>-	    die if not defined $clump->[0];
>-	}
>+    	    if (defined $clump->[1]) {
>+        		t 'make it start at stop of last clump';
>+        		die if not defined $clump->[0];
>+        		push @clumps, $clump;
>+        		$clump = [ $clump->[1], $stop, [ $details ] ];
>+        		die if not defined $clump->[0];
>+    	    }
>+    	    else {
>+        		t 'current clump has no stop, add to clump';
>+        		push @{$clump->[2]}, $details;
>+        		t 'maybe set stop of current clump';
>+        		$clump->[1] = $stop;
>+    	    }
>+    	}
>+    	else {
>+    	    t 'programme has start time, make new clump';
>+    	    if ($clump) {
>+        		die if not defined $clump->[0];
>+        		my $cmp = Date_Cmp($clump->[0], $start);
>+        
>+        		if ($cmp == 0) {
>+        		    # Oddity in the web pages: this programme has the
>+        		    # same start time as a previous one.  (Until I
>+        		    # found this case, the clumping was only for cases
>+        		    # where a programme lacked a start time, I think.)
>+        		    # Anyway handle this by making it join the
>+        		    # existing clump.
>+        		    #
>+        		    t 'same start time as existing clump, join it';
>+
>+        		    if (defined $stop) {
>+            			# Compare stop time of the current clump with
>+            			# this programme and extend the clump if
>+            			# necessary.
>+            			#
>+            			my $cmp = Date_Cmp($clump->[1], $stop);
>+            			if ($cmp < 0) {
>+            			    # Okay, later stop time, extend clump.
>+            			    $clump->[1] = $stop;
>+            			}
>+            			elsif ($cmp == 0) {
>+            			    # Okay.
>+            			}
>+            			elsif ($cmp > 0) {
>+    	    	    	    warn "programme (from $start to $stop) has same start as one before it, but earlier stop, ignoring stop time\n";
>+        	    		}
>+            			else { die }
>+        		    }
>+
>+        		    push @{$clump->[2]}, $details;
>+        		}
>+        		else {
>+        		    # (Don't bother checking that $cmp < 0, it should
>+    	    	    # be, but often programmes appear out of order...)
>+        		    #
>+        		    push @clumps, $clump;
>+        		    $clump = [ $start, $stop, [ $details ] ];
>+        		    t 'started a new clump for new start time, now: ' . d $clump;
>+        		}
>+    	    }
>+    	    else {
>+        		t 'no existing clump, starting one for this programme';
>+        		$clump = [ $start, $stop, [ $details ] ];
>+    	    }
>+    
>+            die if not defined $clump->[0];
>+        }
>     }
>+
>     if ($clump) {
>-	die if not defined $clump->[0];
>-	push @clumps, $clump;
>+    	die if not defined $clump->[0];
>+    	push @clumps, $clump;
>     }
>+
>     t '\@clumps=' . d \@clumps;
>     
>-    # Now add the clumpidx attributes.
>-    foreach (@clumps) {
>-	my ($start, $stop, $l) = @$_;
>-	die if not defined $start;
>-	my $num_in_clump = @$l;
>-	if ($num_in_clump == 1) {
>-	    # Common case, no clumpidx needed.
>-	}
>-	elsif ($num_in_clump > 1) {
>-	    foreach my $i (0 .. $num_in_clump - 1) {
>-		# Add clumpidx as last thing in list.
>-		push @{$l->[$i]}, "$i/$num_in_clump";
>-	    }
>-	}
>-	else { die }
>+   # Now add the clumpidx attributes.
>+   foreach (@clumps) {
>+    	my ($start, $stop, $l) = @$_;
>+    	die if not defined $start;
>+    	my $num_in_clump = @$l;
>+    	if ($num_in_clump == 1) {
>+    	    # Common case, no clumpidx needed.
>+    	}
>+    	elsif ($num_in_clump > 1) {
>+    	    foreach my $i (0 .. $num_in_clump - 1) {
>+        		# Add clumpidx as last thing in list.
>+        		push @{$l->[$i]}, "$i/$num_in_clump";
>+    	    }
>+    	}
>+    	else { die }
>     }
>+
>     t 'after adding clumpidxes, \@clumps=' . d \@clumps;
>     
>     # Finally turn the data into programmes.
>     foreach (@clumps) {
>-	my ($start, $stop, $l) = @$_;
>-	die if not defined $start;
>-	foreach (@$l) {
>-	    my ($title, $url, $clumpidx) = @$_;
>-	    my %h = (channel => $ch_xmltv_id,
>-		     title => [ [ $title, $LANG ] ],
>-		    );
>-	    for (date_to_local($start, $TZ)) {
>-		$h{start} = UnixDate($_->[0], '%q') . " $_->[1]";
>+    	my ($start, $stop, $l) = @$_;
>+    	die if not defined $start;
>+    	foreach (@$l) {
>+    	    my ($title, $url, $clumpidx) = @$_;
>+    	    my %h = (channel => $ch_xmltv_id,
>+    		     title => [ [ $title, $LANG ] ],
>+        );
>+	
>+        for (date_to_local($start, $TZ)) {
>+    		$h{start} = UnixDate($_->[0], '%q') . " $_->[1]";
> 	    }
>+
> 	    if (defined $stop) {
>-		for (date_to_local($stop, $TZ)) {
>-		    $h{stop} = UnixDate($_->[0], '%q') . " $_->[1]";
>-		}
>+    		for (date_to_local($stop, $TZ)) {
>+    		    $h{stop} = UnixDate($_->[0], '%q') . " $_->[1]";
>+    		}
> 	    }
> 	    
> 	    if (defined $url) {
>-		die if ref $url;
>-		$h{url} = [ $url ];
>+    		die if ref $url;
>+    		$h{url} = [ $url ];
> 	    }
>-	    $h{clumpidx} = $clumpidx if defined $clumpidx;
>-	    push @summary_programmes, \%h;
>+
>+        $h{clumpidx} = $clumpidx if defined $clumpidx;
>+        push @summary_programmes, \%h;
> 	}
>     }
> }
>@@ -573,33 +585,33 @@
> 	    my $new = $detailed->{$_};
> 
> 	    if ($_ eq 'title') {
>-		# We know how to merge this.  TODO write general
>-		# XMLTV::Merge. 
>-		#
>-		my %already;
>-		foreach my $a (@$new) {
>-		    my $d = Dumper($a);
>-		    $already{$d}++ && warn "duplicate $_: $d";
>-		}
>-		foreach my $o (@$old) {
>-		    my $d = Dumper($o);
>-		    push @$new, $o unless $already{$d};
>-		}
>+    		# We know how to merge this.  TODO write general
>+    		# XMLTV::Merge. 
>+    		#
>+    		my %already;
>+    		foreach my $a (@$new) {
>+    		    my $d = Dumper($a);
>+    		    $already{$d}++ && warn "duplicate $_: $d";
>+    		}
>+    		foreach my $o (@$old) {
>+    		    my $d = Dumper($o);
>+    		    push @$new, $o unless $already{$d};
>+    		}
> 	    }
> 	    else {
>-		# Compare the two data structures.  For this to work
>-		# correctly it requires Data::Dumper 2.12 or later, as
>-		# shipped with perl 5.8.0.  Older versions don't
>-		# support $Sortkeys.  But we don't have any version
>-		# check here - in the worst case all that results from
>-		# using an older Data::Dumper is a few spurious
>-		# warning messages.
>-		#
>-		my $old_dump = Dumper($old);
>-		my $new_dump = Dumper($new);
>-		if ($old_dump ne $new_dump) {
>-		    warn "mismatch between summary page and details page $url for $_: $old_dump vs $new_dump\n";
>-		}
>+    		# Compare the two data structures.  For this to work
>+    		# correctly it requires Data::Dumper 2.12 or later, as
>+    		# shipped with perl 5.8.0.  Older versions don't
>+    		# support $Sortkeys.  But we don't have any version
>+    		# check here - in the worst case all that results from
>+    		# using an older Data::Dumper is a few spurious
>+    		# warning messages.
>+    		#
>+    		my $old_dump = Dumper($old);
>+    		my $new_dump = Dumper($new);
>+    		if ($old_dump ne $new_dump) {
>+    		    warn "mismatch between summary page and details page $url for $_: $old_dump vs $new_dump\n";
>+    		}
> 	    }
> 	}
> 
>@@ -629,20 +641,22 @@
> my $warned_bad_chars;
> sub tidy( $ ) {
>     for (my $tmp = shift) {
>-	tr/\221\222/''/;
>-	if (tr/\012\015\040-\176\240-\377//dc) {
>-	    warn 'removing bad characters' unless $warned_bad_chars++;
>-	}
>-	return $_;
>+    	tr/\221\222/''/;
>+    	if (tr/\012\015\040-\176\240-\377//dc) {
>+    	    warn 'removing bad characters' unless $warned_bad_chars++;
>+    	}
>+    	return $_;
>     }
> }
> 
> # Returns a programme hashref, or undef, or the magic 'END'.
> sub process_details_page( $$$ ) {
>     foreach (@_) { die if ref }
>+
>     my ($ch_xmltv_id, $url, $master_url) = @_;
>+
>     local $SIG{__WARN__} = sub {
>-	warn "$url (from $master_url): $_[0]";
>+    	warn "$url (from $master_url): $_[0]";
>     };
> 
>     # We make an HTML::TreeBuilder object, get the information
>@@ -650,298 +664,299 @@
>     #
>     my $t = new HTML::TreeBuilder();
>     eval {
>-	$t->parse(tidy(get_nice($url)));
>+    	$t->parse(tidy(get_nice($url)));
>     };
>+
>     if ($@) {
>-	warn "error getting/parsing $url: $@";
>-	return;
>+    	warn "error getting/parsing $url: $@";
>+    	return;
>     }
>-    my @elems = $t->look_down(class => 'detailDeel');
>-    if (not @elems) {
>-	warn "did not see any 'detailDeel' elements, skipping page";
>-	return;
>+
>+    my @elems = $t->look_down('id' => 'progDetail');
>+    if (not @elems or @elems != 1) {
>+    	warn "did not see one single 'progDetail' element, skipping page";
>+    	return;
>     }
>+    
>+    my $elem = $elems[0];
>+    my @hs = $elem->look_down('_tag' => 'h3');    
>+    if (not @hs or @hs != 2) {
>+        warn "could not find a valid 'h3' title, skipping page";
>+        return;
>+    }    
>+    
>+    my $naam = $hs[0]->as_text();
>+	$naam =~ s/^\s+//; $naam =~ s/\s+$//;
>+    
>+    my @desc;                            # accumulate bits
>+    my @ps = $elem->look_down('_tag' => 'p');
>+    if (not @ps) {
>+        warn "could not find a valid description, skipping page";
>+        return;
>+    }
>+   
>+    for( my $i=0; $i<=2; $i++ )
>+    {
>+        my $text = $ps[$i]->as_text();
>+        $text =~ s/^\s+//;
>+        $text =~ s/\s+$//;
>+        push @desc, $text if $text ne '';
>+    }
>+    
>+    @elems = $t->look_down('id' => 'data');
>+    if (not @elems or @elems != 1) {
>+    	warn "did not see one single 'data' element, skipping page";
>+    	return;
>+    }
>+    
>     my @info;
>-    foreach (@elems) {
>-	my @cont = grep { ref } $_->content_list();
>-	my $n = scalar @cont;
>-	if ($n != 2) {
>-	    warn "'detailDeel' has $n elements instead of 2";
>-	    next;
>-	}
>-	my ($k, $v) = @cont;
>-	for ($k->attr('class')) {
>-	    if (not defined or $_ ne 'detailLabel2') {
>-		warn "didn't see 'detailLabel2' in 'detailDeel'";
>-		next;
>-	    }
>-	}
>-	for ($v->attr('class')) {
>-	    if (not defined or $_ ne 'detailContent2') {
>-		warn "didn't see 'detailContent2' in 'detailDeel'";
>-		next;
>-	    }
>-	}
>-	push @info, [ $k->as_text(), $v->as_text() ];
>+    $elem = $elems[0];
>+    my @trs = $elem->look_down('_tag' => 'tr');
>+    if( not @trs ) {
>+        warn "did not find records below the 'data' element, skipping page";
>+        return;
>+    }
>+    
>+    foreach (@trs) {
>+        my $tr = $_;
>+        my @ths = $tr->look_down('_tag' => 'th');
>+        my @tds = $tr->look_down('_tag' => 'td');
>+        if( @ths == 1 && @tds == 1 ) {
>+        	push @info, [ $ths[0]->as_text(), $tds[0]->as_text() ];
>+        }
>     }
>     $t->delete(); undef $t;
> 
>     # Process the list of [ heading, data ] pairs.
>     my (
>-	# Exactly one:
>-	$van, $tot, $naam,
>+    	# Exactly one:
>+    	$van, $tot,
> 
>-	# At most one:
>+    	# At most one:
>         $director, $previously_shown, $orig_title, $sub_title, $genre,
>         $date, $episode_num, $actors, $writers, $commentators,
> 
>-	# Zero or more:
>+    	# Zero or more:
>         @presenter, @url,
>-       );
>+    );
>+
>     # NB 'at most one' $actors but that one entry can give several.
> 
>     my ($teletext_sub, $widescreen) = 0; # boolean
>-    my @desc;                            # accumulate bits
>     my $seen_tijdstip = 0;
>     my $last;
>-  ELEM: foreach (@info) {
>-	my ($regel, $text) = @$_;
>-	foreach ($regel, $text) {
>-	    s/^\s+//; s/\s+$//;
>-	}
>-
>-	if ($regel eq '') {
>-	    # Continuation of the previous one, hopefully.
>-	    $regel = $last;
>-	}
>-	else {
>-	    # They usually end with a colon but not always.
>-	    $regel =~ s/:$//;
>-	    $last = $regel;
>-	}
> 
>-	if ($regel eq 'Tijdstip') {
>-	    warn "seen 'Tijdstip' twice\n" if $seen_tijdstip++;
>-	    if (length($text)<=16) {
>-		t "'onvolledig' is true, nothing more to write";
>-		t 'process_details_page() RETURNING';
>-		return 'END';
>-	    }
>-	    else {
>-		# Extract time strings from the text, but not full
>-		# Date::Manip objects.
>-		#
>-		$van = time_van_str($text);
>-		$tot = time_tot_str($text);
>-	    }
>-	}
>-	elsif ($regel eq 'Inhoud') {
>-	    # Empty text for this happens often, just skip it.
>-	    push @desc, $text if $text ne '';
>-	}
>-	elsif ($regel eq 'Programma') {
>-	    warn "seen 'Programma' twice\n" if defined $naam;
>-	    # FIXME should really look for 'herhaling' in italics.
>-	    if ($text =~ s/\bherhaling\s+van\s+(\d\d?)-(\d\d?)-(\d{4})//) {
>-		warn "seen previously-shown information twice\n"
>-		  if $previously_shown;
>-		my ($dd, $mm, $yyyy) = ($1, $2, $3);
>-		$previously_shown = { start => "$yyyy$mm$dd" };
>-		$text =~ s/^\s+//; $text =~ s/\s+$//;
>-	    }
>-	    elsif ($text =~ s/herhaling\b//) {
>-		# Repeat, but no previous date given.  NB
>-		# sometimes we see 'herhaling' without a space
>-		# before it, as in the redundant
>-		#
>-		# 'Netwerk herhalingenherhaling'
>-		#
>-		# Hence no \b at the start of the regexp.  We just
>-		# have to hope there aren't too many compound
>-		# words ending in 'herhaling'.
>-		#
>-		$previously_shown = {};
>-		$text =~ s/^\s+//; $text =~ s/\s+$//;
>-	    }
>-	    $naam = $text;
>-	}
>-	elsif ($regel eq 'Genre') {
>-	    warn "seen 'Genre' twice\n" if defined $genre;
>-	    # Empty text for this happens often, just skip it.
>-	    $genre = $text if $text ne '';
>-	}
>-	elsif ($regel eq 'Zender') {
>-	    # I think this means 'broadcaster' but the information
>-	    # is redundant because we already know the channel.
>-	    #
>-	    # Then we should check it and warn if it differs!  But
>-	    # it does differ - every programme on the channel
>-	    # Nederland 1, it seems, has Zender of 'Nederland
>-	    # 2'. So we just ignore this information.
>-	    #
>-	}
>-	elsif ($regel eq 'Omroep') {
>-	    # FIXME I don't know what this means (the dictionary
>-	    # says 'wireless telegraph' but that's no help) so
>-	    # just ignore it.
>-	    #
>-	}
>-	elsif ($regel eq 'Kenmerken') {
>-	    foreach (split /,\s*/, $text) {
>-		if ($_ eq 'Teletekst ondertiteld') {
>-		    # I'm guessing this means teletext subtitles :-).
>-		    $teletext_sub++
>-		      && warn 'seen teletext subtitles twice';
>-		}
>-		elsif ($_ eq 'Breedbeeld uitzending') {
>-		    $widescreen++ && warn 'seen widescreen twice';
>-		}
>-		elsif (length >= 50) {
>-		    # Some long sentence, part of description.
>-		    push @desc, $_;
>-		}
>-		else {
>-		    warn "unknown 'Kenmerken' bit $_"
>-		      unless $warned_regel{"Kenmerken: $_"}++;	
>-		    push @desc, $_;
>-		}
>-	    }
>-	}
>-	elsif ($regel eq 'Presentatie') {
>-	    push @presenter, $text;
>-	}
>-	elsif ($regel eq 'Afleverings nummer') {
>-	    warn "seen 'Afleverings nummer' twice"
>-	      if defined $episode_num;
>-	    if ($text eq 'Slot') {
>-		# The last episode of a series.  There isn't a way to
>-		# store this in the current XMLTV format.
>-		#
>-		warn "discarding 'Slot'" unless $warned_slot++;
>-	    }
>-	    elsif ($text =~ /^\d+$/) {
>-		if ($text == 0) {
>-		    warn "I thought episode nums on the site were from 1";
>-		}
>-		else {
>-		    $episode_num = $text - 1;
>-		}
>-	    }
>-	    elsif ($text =~ /^(?:\d+-)+\d$/) {
>-		# This means multiple episodes.  This ought to be
>-		# handled by turning the programme into a clump.
>-		#
>-		warn "programme covers multiple episodes ($text), not handled";
>-	    }
>-	    else {
>-		warn "bad episode number $text";
>-	    }
>-	}
>-	elsif ($regel eq 'Titel aflevering') {
>-	    warn "seen 'Titel aflevering' twice"
>-	      if defined $sub_title;
>-	    $sub_title = $text;
>-	}
>-	elsif ($regel eq 'Webpagina') {
>-	    # We have to turn the string given, which is normally
>-	    # just a hostname, into a URL.  I don't see why they
>-	    # don't just link to it directly, this is a web site
>-	    # after all.
>-	    #
>-	    # Anyway, the URI library doesn't seem to have any way
>-	    # to take a string and turn it into a URL adding
>-	    # 'http:' if necessary, so we do this by hand.
>-	    #
>-	    if ($text !~ tr/://) {
>-		$text = "http://$text";
>-	    }
>-	    push @url, $text;
>-	}
>-	elsif ($regel eq 'Rolverdeling') {
>-	    warn "seen 'Rolverdeling' twice" if $actors;
>-
>-	    # 'e.a' appearing in the description means 'and others';
>-	    # it's implicit in XMLTV that there might be other actors,
>-	    # so we quietly remove it.
>-	    #
>-	    $text =~ s/\s*e\.a\s*$//;
>-	
>-	    while (length $text) {
>-		if ($text =~ s/\s*([^:]+):\s*([^.]+)(?:$|\.)//) {
>-		    warn "discarding information about the parts played by each actor\n"
>-		      unless $warned_discarding_parts++;
>-		    push @$actors, $2;
>-		}
>-		elsif ($text =~ s/\s*([^,]+)(?:$|,)//) {
>-		    push @$actors, $1;
>-		}
>-		else {
>-		    warn "unknown remnant 'Rolverdeling' text '$text'";
>-		    last;
>-		}
>-	    }
>-	}
>-	elsif ($regel eq 'Scenario schrijver') {
>-	    warn "seen 'Scenario schrijver' twice" if $writers;
>-	    push @$writers, $text;
>-	}
>-	elsif ($regel eq 'E-mail') {
>-	    push @url, "mailto:$text";
>-	}
>-	elsif ($regel eq 'Bron') {
>-	    # FIXME cannot do anything special with this.  It
>-	    # means 'source' and perhaps by parsing the text we
>-	    # could find the names of writers or whatever.
>-	    #
>-	    push @desc, "$regel: $text";
>-	}
>-	elsif ($regel eq 'Commentaar') {
>-	    push @$commentators, $text;
>-	}
>-	elsif ($regel eq 'Jaar van premiere') {
>-	    # Year of release, I think.
>-	    warn "seen 'Jaar van premiere' twice"
>-	      if defined $date;
>-	    $date = $text;
>-	}
>-	elsif ($regel eq 'Regisseur') {
>-	    warn "seen 'Regisseur' twice" if defined $director;
>-	    $director = $text;
>-	}
>-	elsif ($regel eq 'Orginele titel') {
>-	    warn "seen 'Orginele titel' twice" if defined $orig_title;
>-	    $orig_title = $text;
>-	}
>-	elsif ($regel eq 'Behaalde prijzen') {
>-	    # Awards won.  It doesn't seem worth adding a separate
>-	    # field for this to the XMLTV format, just append to
>-	    # the description.
>-	    #
>-	    push @desc, "$regel: $text";
>-	}
>-	elsif ($regel eq 'Website') {
>-	    push @url, ($text =~ /^[a-z]+:/) ? $text : "http://$text";
>-	}
>-	else {
>-	    # Unknown key, but let's add it to the desc so we
>-	    # don't lose information.  These newlines are just for
>-	    # the benefit of someone reading the XML by hand.
>-	    #
>-	    push @desc, "$regel: $text";
>-	    warn "unknown programme info key $regel\n"
>-	      unless $warned_regel{$regel}++;
>-	}
>+    ELEM: foreach (@info) {
>+        my ($regel, $text) = @$_;
>+        foreach ($regel, $text) {
>+    	    s/^\s+//; s/\s+$//;
>+    	}
>+
>+    	if ($regel eq '') {
>+    	    # Continuation of the previous one, hopefully.
>+    	    $regel = $last;
>+    	}
>+    	else {
>+    	    # They usually end with a colon but not always.
>+    	    $regel =~ s/:$//;
>+    	    $last = $regel;
>+    	}
>+
>+    	if ($regel eq 'Datum en tijdstip') {
>+    	    warn "seen 'Tijdstip' twice\n" if $seen_tijdstip++;
>+    	    if (length($text)<=16) {
>+        		t "'onvolledig' is true, nothing more to write";
>+        		t 'process_details_page() RETURNING';
>+        		return 'END';
>+    	    }
>+            else {
>+        		# Extract time strings from the text, but not full
>+        		# Date::Manip objects.
>+        		#
>+        		($van,$tot) = time_to_str($text);
>+	        }
>+    	}
>+    	elsif ($regel eq 'Inhoud') {
>+    	    # Empty text for this happens often, just skip it.
>+    	    push @desc, $text if $text ne '';
>+    	}
>+    	elsif ($regel eq 'Genre') {
>+    	    warn "seen 'Genre' twice\n" if defined $genre;
>+    	    # Empty text for this happens often, just skip it.
>+    	    $genre = $text if $text ne '';
>+    	}
>+    	elsif ($regel eq 'Zender') {
>+            # In the new layout, this field contains the logo
>+            # for the channel. Ignore it, because these logos
>+            # are too small to use in an application.
>+    	    #
>+    	}
>+    	elsif ($regel eq 'Omroep') {
>+            # We ignore this setting, because the XMLTV format
>+            # doesn't have room for this. In Holland, a few broadcasters
>+            # share the same channel.       
>+    	    #
>+    	}
>+    	elsif ($regel eq 'Bijzonderheden') {
>+    	    foreach (split /,\s*/, $text) {
>+        		if ($_ eq 'Teletekst ondertiteld') {
>+        		    # I'm guessing this means teletext subtitles :-).
>+        		    $teletext_sub++
>+        		      && warn 'seen teletext subtitles twice';
>+        		}
>+        		elsif ($_ eq 'Breedbeeld uitzending') {
>+        		    $widescreen++ && warn 'seen widescreen twice';
>+        		}
>+        		elsif (length >= 50) {
>+        		    # Some long sentence, part of description.
>+        		    push @desc, $_;
>+        		}
>+        		else {
>+        		    warn "unknown 'Bijzonderheden' bit $_"
>+        		      unless $warned_regel{"Bijzonderheden: $_"}++;	
>+        		    push @desc, $_;
>+        		}
>+    	    }
>+    	}
>+        elsif ($regel eq 'Teletekst') {
>+            # Teletekst contains the 'teletext' page for this programme
>+            # This information isn't used at the moment
>+        }
>+    	elsif ($regel eq 'Presentatie') {
>+    	    push @presenter, $text;
>+    	}
>+    	elsif ($regel eq 'Aflevering') {
>+    	    warn "seen 'Aflevering"
>+    	      if defined $episode_num;
>+    	    if ($text eq 'Slot') {
>+        		# The last episode of a series.  There isn't a way to
>+        		# store this in the current XMLTV format.
>+        		#
>+        		warn "discarding 'Slot'" unless $warned_slot++;
>+    	    }
>+    	    elsif ($text =~ /^\d+$/) {
>+        		if ($text == 0) {
>+        		    warn "I thought episode nums on the site were from 1";
>+        		}
>+    	    	else {
>+        		    $episode_num = $text - 1;
>+        		}
>+    	    }
>+    	    elsif ($text =~ /^(?:\d+-)+\d$/) {
>+    	    	# This means multiple episodes.  This ought to be
>+        		# handled by turning the programme into a clump.
>+        		#
>+        		warn "programme covers multiple episodes ($text), not handled";
>+    	    }
>+    	    else {
>+        		warn "bad episode number $text";
>+    	    }
>+    	}
>+    	elsif ($regel eq 'Titel aflevering') {
>+    	    warn "seen 'Titel aflevering' twice"
>+    	      if defined $sub_title;
>+    	    $sub_title = $text;
>+    	}
>+    	elsif ($regel eq 'Url') {
>+    	    # We have to turn the string given, which is normally
>+    	    # just a hostname, into a URL.  I don't see why they
>+    	    # don't just link to it directly, this is a web site
>+    	    # after all.
>+    	    #
>+    	    # Anyway, the URI library doesn't seem to have any way
>+    	    # to take a string and turn it into a URL adding
>+    	    # 'http:' if necessary, so we do this by hand.
>+    	    #
>+    	    if ($text !~ tr/://) {
>+        		$text = "http://$text";
>+    	    }
>+    	    push @url, $text;
>+    	}
>+    	elsif ($regel eq 'Acteurs') {
>+    	    warn "seen 'Rolverdeling' twice" if $actors;
>+    
>+    	    # 'e.a' appearing in the description means 'and others';
>+    	    # it's implicit in XMLTV that there might be other actors,
>+    	    # so we quietly remove it.
>+    	    #
>+    	    $text =~ s/\s*e\.a\s*$//;
>+    	
>+    	    while (length $text) {
>+        		if ($text =~ s/\s*([^:]+):\s*([^.]+)(?:$|\.)//) {
>+        		    warn "discarding information about the parts played by each actor\n"
>+        		      unless $warned_discarding_parts++;
>+        		    push @$actors, $2;
>+        		}
>+        		elsif ($text =~ s/\s*([^,]+)(?:$|,)//) {
>+        		    push @$actors, $1;
>+        		}
>+        		else {
>+        		    warn "unknown remnant 'Rolverdeling' text '$text'";
>+        		    last;
>+        		}
>+    	    }
>+    	}
>+    	elsif ($regel eq 'Scenario') {
>+    	    warn "seen 'Scenario' twice" if $writers;
>+    	    push @$writers, $text;
>+    	}
>+    	elsif ($regel eq 'Email') {
>+    	    push @url, "mailto:$text";
>+    	}
>+    	elsif ($regel eq 'Bron') {
>+    	    # FIXME cannot do anything special with this.  It
>+    	    # means 'source' and perhaps by parsing the text we
>+    	    # could find the names of writers or whatever.
>+    	    #
>+    	    # push @desc, "$regel: $text";
>+    	}
>+    	elsif ($regel eq 'Commentaar') {
>+    	    push @$commentators, $text;
>+    	}
>+    	elsif ($regel eq 'Jaar van premiere') {
>+    	    # Year of release, I think.
>+    	    warn "seen 'Jaar van premiere' twice"
>+    	      if defined $date;
>+    	    $date = $text;
>+    	}
>+    	elsif ($regel eq 'Regisseur') {
>+    	    warn "seen 'Regisseur' twice" if defined $director;
>+    	    $director = $text;
>+    	}
>+    	elsif ($regel eq 'Orginele titel') {
>+    	    warn "seen 'Orginele titel' twice" if defined $orig_title;
>+    	    $orig_title = $text;
>+    	}
>+    	elsif ($regel eq 'Behaalde prijzen') {
>+    	    # Awards won.  It doesn't seem worth adding a separate
>+    	    # field for this to the XMLTV format, just append to
>+    	    # the description.
>+    	    #
>+    	    # push @desc, "$regel: $text";
>+    	}
>+    	elsif ($regel eq 'Website') {
>+    	    push @url, ($text =~ /^[a-z]+:/) ? $text : "http://$text";
>+    	}
>+    	else {
>+    	    # Unknown key, report it back to the prompt
>+    	    #
>+    	    warn "unknown programme info key $regel\n"
>+    	      unless $warned_regel{$regel}++;
>+    	}
>     }
> 
>     if (not defined $naam) {
>-	warn "did not see programme title, skipping programme\n";
>-	return;
>+    	warn "did not see programme title, skipping programme\n";
>+    	return;
>     }
>     if (not defined $van) {
>-	warn "did not see programme times, skipping programme\n";
>-	return;
>+       	warn "did not find programme starttime, skipping programme\n";
>+    	return;
>+    }
>+    if (not defined $tot) {
>+       	warn "did not see programme endtime, skipping programme\n";
>+    	return;
>     }
>-    die if not defined $tot;
> 
>     my @title = ([ $naam, $LANG]);
>     push @title, [ $orig_title ] if defined $orig_title; # not Dutch!
>@@ -983,41 +998,32 @@
>     return \%prog;
> }
> 
>-sub time_tot_str( $ ) {
>+sub time_to_str( $ ) {
>     my $input = shift;
>-    if (length($input) == 15) {
>-	$input .= '05:00';
>-    }
>-    my $datum = substr($input,0,length($input)-11);
>-    my $tot   = substr($input,-5);
>-
>-    $datum =~ /(\d\d?)-(\d\d?)-(\d+)/
>-      or die "cannot find year in '$datum'";
>-    my ($dd, $mm, $yyyy) = ($1, $2, $3);
>-    foreach ($dd, $mm) { $_ = "0$_" if length == 1 }
>-    $tot =~ /(\d\d):(\d\d)/
>-      or die "cannot find time in '$tot'";
>-    my ($HH, $MM) = ($1, $2);
> 
>-    return "$yyyy-$mm-$dd $HH:$MM:00";
>-}
>-sub time_van_str( $ ) {
>-    my $input = shift;
>-    if (length($input) == 15) {
>-	$input .= '06:00';
>+    # Replace months
>+	$input =~ s/\bjanuari\b/1/g;
>+	$input =~ s/\bfebruari\b/2/g;
>+	$input =~ s/\bmaart\b/3/g;
>+	$input =~ s/\bapril\b/4/g;
>+	$input =~ s/\bmei\b/5/g;
>+	$input =~ s/\bjuni\b/6/g;
>+	$input =~ s/\bjuli\b/7/g;
>+	$input =~ s/\baugustus\b/8/g;
>+	$input =~ s/\bseptember\b/9/g;
>+	$input =~ s/\boktober\b/10/g;
>+	$input =~ s/\bnovember\b/11/g;
>+	$input =~ s/\bdecember\b/12/g;
>+    
>+    if( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), (\d\d:\d\d)-(\d\d:\d\d) uur/ ) {
>+    	return ( "$3-$2-$1 $4:00", "$3-$2-$1 $5:00" );
>+    }
>+    elsif( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), -(\d\d):(\d\d) uur/ ) {
>+    	return ( "$4-$3-$2 $4:".($5-30).":00", "$3-$2-$1 $4:$5:00" );
>+    }
>+    elsif( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), (\d\d):(\d\d)- uur/ ) {
>+    	return ( "$3-$2-$1 $4:$5:00", "$3-$2-$1 $4:".($5+30).":00" );
>     }
>-    my $datum = substr($input,0,length($input)-11);
>-    my $van = substr($input,-11);
>-    $van = substr($van,0,5);
>-
>-    $datum =~ /(\d\d?)-(\d\d?)-(\d{4})/
>-      or die "cannot find year in '$datum'";
>-    my ($dd, $mm, $yyyy) = ($1, $2, $3);
>-    foreach ($dd, $mm) { $_ = "0$_" if length == 1 }
>-    $van =~ /(\d\d):(\d\d)/
>-      or die "cannot find time in '$van'";
>-    my ($HH, $MM) = ($1, $2);
>-    return "$yyyy-$mm-$dd $HH:$MM:00";
> }
> 
> 
>@@ -1032,40 +1038,44 @@
>     my %channels;
> 
>     if ($sanity) {
>-	# Download what we can.
>-	my $url = 'http://www.tvgids.nl';
>-	#All stations are in the select box.
>-	#The station ID is the option value
>-	my $t = new HTML::TreeBuilder();
>-	$t->parse(get_nice($url));
>-	my @conts = map { [ $_->content_list() ] }
>-	  $t->look_down('_tag' => 'select', 'name' => 'station');
>-	foreach my $cont (@conts) {
>-	    my @children =@$cont;
>-	    if (scalar(@children) == 0) {
>-		warn 'No stations are defined';
>-		next;
>-	    }
>-	    foreach my $station_line (@children) {
>-		if ($station_line ne ' ') {
>-		    #This if statement is to prevent parsing the last
>-		    #empty element from the list.
>-			
>-		    my $channel_id = $station_line->attr('value');
>-		
>-		    #I am only interested in the normal channels.
>-		    #tvgids.nl has some pages for the regional stations also
>-		    #All normal channels have id <0,100>
>-		    #That is at the moment... Could change in future...
>-		    if ($channel_id > 0 && $channel_id < 100) {
>-			my $channel_name = $station_line->as_text();
>-			$channels{$channel_id} = $channel_name
>-		    }
>-		}
>-
>-	    }
>-	}	
>-	$t->delete(); undef $t;
>+    	# Download what we can.
>+    	my $url = 'http://www.tvgids.nl';
>+    	#All stations are in the select box.
>+    	#The station ID is the option value
>+    	my $t = new HTML::TreeBuilder();
>+    	$t->parse(get_nice($url));
>+    	my @conts = map { [ $_->content_list() ] }
>+    	  $t->look_down('_tag' => 'select', 'name' => 'station');
>+    
>+    	foreach my $cont (@conts) {
>+    	    my @children =@$cont;
>+    
>+    	    if (scalar(@children) == 0) {
>+        		warn 'No stations are defined';
>+    	    	next;
>+            }
>+        
>+    	    foreach my $station_line (@children) { 
>+        		if ($station_line ne ' ') {
>+    	    	    # This if statement is to prevent parsing the last
>+    		        # empty element from the list.
>+    			
>+        		    my $channel_id = $station_line->attr('value');
>+    		
>+    	    	    # I am only interested in the normal channels.
>+    		        # tvgids.nl has some pages for the regional stations also
>+        		    # All normal channels have id <0,300>
>+    	    	    # That is at the moment... Could change in future...
>+    		        if ($channel_id && $channel_id > 0 && $channel_id < 300) {
>+                        print $channel_id."\n";
>+        			    my $channel_name = $station_line->as_text();
>+            			$channels{$channel_id} = $channel_name
>+    	            }
>+        		}
>+    	    }
>+    	}	
>+    
>+    	$t->delete(); undef $t;
>     }
> 
>     # Now read the config file and check against any data we already
>@@ -1138,151 +1148,144 @@
> # or else drop it with a warning.
> #
> my %warned_bad_channel_name;
>+
> sub process_summary_page( $$$$ ) {
>     my ($url, $day, $official_day, $ch_name_ref) = @_;
>     die if not defined $url; die if ref $url;
>     die if not defined $day; die if ref $day;
>     die if not defined $official_day; die if ref $official_day;
>+
>     local $SIG{__WARN__} = sub {
>-	warn "$url: $_[0]";
>+    	warn "$url: $_[0]";
>     };
>+    
>     local $SIG{__DIE__} = sub {
>-	die "$url: $_[0]";
>+    	die "$url: $_[0]";
>     };
>+    
>     my $t = new HTML::TreeBuilder;
>     $t->parse(get_nice($url));
>-    my %interesting; ++ $interesting{$_} foreach
>-      qw(lijst_zender lijst_tijd details_programma);
>-    my @elems = $t->look_down(sub { die if not defined $_[0];
>-				    my $c = $_[0]->attr('class');
>-				    return 0 if not defined $c;
>-				    return $interesting{$c} });
>+    my @elems = $t->look_down('_tag' => 'table', 'class' => 'overzicht');
>+
>     if (not @elems) {
>-	warn 'did not find any programmes in page';
>-	return ();
>+    	warn 'did not find any programmes in page';
>+    	return ();
>     }
>+
>     my @bits;
>+
>     foreach my $e (@elems) {
>-	my $class = $e->attr('class');
>-	t "looking at elem of class $class";
>-	my $href;
>-      check_content:
>-	my @cont = $e->content_list();
>-	my $got_href = $e->attr('href');
>-	if (defined $got_href) {
>-	    t "got href: $got_href";
>-	    warn "seen 'href's contained inside each other"
>-	      if defined $href;
>-	    $href = $got_href;
>-	}
>-	if (not @cont) {
>-	    warn "found $class elem without content, ignoring"
>-	      unless $class eq 'lijst_zender';
>-	    next;
>-	}
>-	elsif (@cont == 1) {
>-	    for ($cont[0]) {
>-		if (ref) {
>-		    # Unpack this extra layer of element.
>-		    $e = $_;
>-		    goto check_content;
>-		}
>-		s/^\s+//; s/\s+$//;
>-		push @bits, [ $class, $href, $_ ];
>-	    }
>-	}
>-	elsif (@cont > 1) {
>-	    warn "found $class elem with more than one elem inside, ignoring"
>-	      unless $class eq 'lijst_zender';
>-	    next;
>-	}
>+    	my @cont = $e->look_down('_tag' => 'tr');
>+        my @elems = $t->look_down('_tag' => 'table', 'class' => 'overzicht');
>+
>+        if (not @elems) {
>+        	warn 'did not find any programmes in page';
>+        	return ();
>+        }
>     }
>-    $t->delete(); undef $t;
>+
>+    foreach my $e (@elems) {
>+       	my @h4elems = $e->look_down('_tag' => 'h4');
>+        if (not @h4elems) {
>+            warn 'did not find any date in page';
>+            return ();
>+        }
>+        else {
>+            push @bits, [ $h4elems[0]->as_text(), $_ ];
>+        }
>+
>+    	my @cont = $e->look_down('_tag' => 'tr');
>+        foreach my $tr (@cont) {
>+            # Fetch the time
>+            my @ths = $tr->look_down('_tag' => 'th');
>+            my @tds = $tr->look_down('_tag' => 'td');
>+            
>+            if( @ths == 1 and @tds > 1 ) {
>+                my $th = $ths[0]; my $td1 = $tds[0]; my $td2 = $tds[1];
>+                my @ahs = $td1->look_down('_tag' => 'a');
>+                if( @ahs == 1 ) {
>+                    push @bits, [ $th->as_text(), $td1->as_text(), $td2->as_text(), $ahs[0]->attr('href'), $_ ];
>+                }
>+            }
>+        }
>+
>+    }
>+
>+    $t->delete();
>+    undef $t;
>+    
>     if (not @bits) {
>-	warn "did not see any content, skipping page";
>-	return ();
>+    	warn "did not see any content, skipping page";
>+    	return ();
>     }
>+    
>     t 'got bits: ' . d @bits;
>-    if ($bits[0]->[0] eq 'lijst_zender') {
>-	my $date_str = $bits[0]->[2];
>-	my $d = parse_dutch_date($date_str);
>-	if (defined $d) {
>-	    my ($d_base, $d_tz) = @{date_to_local($d, $TZ)};
>-	    die "date in page $d_base ($date_str) doesn't match expected $official_day"
>-	      if UnixDate($d_base, '%Q') ne UnixDate($official_day, '%Q');
>-	    shift @bits;
>-	}
>-	# otherwise, leave it for later processing
>-    }
>+    
>+   	my $date_str = $bits[0]->[0];
>+   	my $d = parse_dutch_date($date_str);
>+   	if (defined $d) {
>+   	    my ($d_base, $d_tz) = @{date_to_local($d, $TZ)};
>+   	    die "date in page $d_base ($date_str) doesn't match expected $official_day"
>+   	      if UnixDate($d_base, '%Q') ne UnixDate($official_day, '%Q');
>+   	    shift @bits;
>+   	}
>+
>     my @todo;
>-    while (@bits >= 3) {
>-	my $ch_bit = shift @bits;
>-	my $ch_class = $ch_bit->[0];
>-	if ($ch_class ne 'lijst_zender') {
>-	    warn "bit expected to be channel name has class $ch_class not lijst_zender, skipping";
>-	    next;
>-	}
>-	my $ch = $ch_bit->[2];
>-	t 'shifted bit to get $ch=' . d $ch;
>-	for ($$ch_name_ref) {
>-	    if (not defined) { $_ = $ch }
>-	    elsif ($_ ne $ch) {
>-		# Most likely the channels file has the wrong name.
>-		warn "expected channel name $_, got $ch\n"
>-		  unless $warned_bad_channel_name{$_}{$ch}++;
>-		$_ = $ch;
>-	    }
>-	}
>+
>+    while (@bits > 0) {
>+    	my $bit = shift @bits;
>+
>+    	my $ch = $bit->[2];
>+    	t 'shifted bit to get $ch=' . d $ch;
>+    	for ($$ch_name_ref) {
>+    	    if (not defined) { $_ = $ch }
>+       	    elsif ($_ ne $ch) {
>+        		# Most likely the channels file has the wrong name.
>+        		warn "expected channel name $_, got $ch\n"
>+        		  unless $warned_bad_channel_name{$_}{$ch}++;
>+        		$_ = $ch;
>+    	    }
>+    	}
> 	
>-	my $times_bit = shift @bits;
>-	my $times_class = $times_bit->[0];
>-	if ($times_class ne 'lijst_tijd') {
>-	    warn "bit expected to be times has class $times_class not lijst_tijd, skipping";
>-	    next;
>-	}
>-	my $times = $times_bit->[2];
>-	t 'shifted bit to get $times=' . d $times;
>+    	my $times = $bit->[0];
>+    	t 'shifted bit to get $times=' . d $times;
>+    	
>+    	my $title_href = $bit->[3];
>+    	my $title = $bit->[1];
>+    	t 'shifted bit to get $title_href=' . d $title_href . ', '
>+    	  . '$title=' . d $title;
>+          
>+    	if ($title =~ /^Ieder heel uur .+, tenzij anders vermeld$/) {
>+    	    # A certain programme on the hour.  But it isn't worth
>+    	    # adding this to the output, ignore it.
>+    	    #
>+    	    next;
>+    	}
>+
>+    	if ($title eq 'NB: Programmering onder voorbehoud') {
>+    	    # Programming subject to change.  There isn't a way to
>+    	    # represent this in the current XMLTV format.
>+    	    #
>+    	    next;
>+    	}
> 	
>-	my $title_bit = shift @bits;
>-	my $title_class = $title_bit->[0];
>-	if ($title_class ne 'details_programma') {
>-	    warn "bit expected to be title has class $title_class not details_programma, skipping";
>-	    next;
>-	}
>-	my $title_href = $title_bit->[1];
>-	my $title = $title_bit->[2];
>-	t 'shifted bit to get $title_href=' . d $title_href . ', '
>-	  . '$title=' . d $title;
>-
>-	if ($title =~ /^Ieder heel uur .+, tenzij anders vermeld$/) {
>-	    # A certain programme on the hour.  But it isn't worth
>-	    # adding this to the output, ignore it.
>-	    #
>-	    next;
>-	}
>-	if ($title eq 'NB: Programmering onder voorbehoud') {
>-	    # Programming subject to change.  There isn't a way to
>-	    # represent this in the current XMLTV format.
>-	    #
>-	    next;
>-	}
>-	
>-	my ($start_hhmm, $stop_hhmm);
>-	if ($times =~ /^(\d\d):(\d\d)-/
>-	    and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) {
>-	    $start_hhmm = "$1:$2";
>-	}
>-	if ($times =~ /-(\d\d):(\d\d)$/
>-	    and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) {
>-	    $stop_hhmm = "$1:$2";
>-	}
>-
>-	# Right, got channel name, times, and title.
>-	# FIXME should check channel name (among other things).
>-	#
>-	my $title_url = URI->new_abs($title_href, $url);
>-	# We prefer to handle URLs as strings.
>-	push @todo, [ $start_hhmm, $stop_hhmm, $title, "$title_url" ];
>+       	my ($start_hhmm, $stop_hhmm);
>+    	if ($times =~ /^(\d\d):(\d\d) - /
>+    	    and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) {
>+    	    $start_hhmm = "$1:$2";
>+    	}
>+    	if ($times =~ / - (\d\d):(\d\d)$/
>+    	    and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) {
>+    	    $stop_hhmm = "$1:$2";
>+    	}
>+
>+    	# Right, got channel name, times, and title.
>+    	# FIXME should check channel name (among other things).
>+    	#
>+       	my $title_url = URI->new_abs($title_href, $url);
>+        
>+       	# We prefer to handle URLs as strings.
>+       	push @todo, [ $start_hhmm, $stop_hhmm, $title, "$title_url" ];
>     }
> 
>     # Now we need to make some sense of the times.  When stop time
>@@ -1296,91 +1299,98 @@
>     my $next_day = UnixDate(DateCalc($day, '+ 1 day'), '%Y-%m-%d');
>     my $crossing_at;
> #    local $Log::TraceMessages::On = 1;
>+
>     t 'looking at raw times and searching for midnight crossing: ' . d \@todo;
>+
>     for (my $i = 0; $i < @todo; ++$i) {
>-	my ($start_hhmm, $stop_hhmm) = @{$todo[$i]};
>-	t '$start_hhmm=' . d $start_hhmm;
>-	t '$stop_hhmm=' . d $stop_hhmm;
>-	next if not defined $start_hhmm;
>-	next if not defined $stop_hhmm;
>-
>-	my $start = parse_local_date("$day $start_hhmm", $TZ);
>-	my $stop = parse_local_date("$day $stop_hhmm", $TZ);
>-	t "checking if $start -> $stop goes backwards";
>-	next if Date_Cmp($start, $stop) <= 0;
>-	t "yup, it's a candidate";
>-
>-	my $stop_next_day = parse_local_date("$next_day $stop_hhmm", $TZ);
>-	die if Date_Cmp($stop_next_day, $start) <= 0;
>-	t 'if it were, stop time on next day would be: ' . d $stop_next_day;
>-
>-	my $distance = Delta_Format(DateCalc($start, $stop_next_day), 0, '%st');
>-	t '...and length of programme: ' . d $distance;
>-	t 'shortest length so far: ' . d $shortest;
>-	if (not defined $shortest or $distance < $shortest) {
>-	    t 'this is the best so far';
>-	    $shortest = $distance;
>-	    $crossing_at = $i;
>-	}
>+    	my ($start_hhmm, $stop_hhmm) = @{$todo[$i]};
>+    	t '$start_hhmm=' . d $start_hhmm;
>+    	t '$stop_hhmm=' . d $stop_hhmm;
>+    	next if not defined $start_hhmm;
>+    	next if not defined $stop_hhmm;
>+
>+    	my $start = parse_local_date("$day $start_hhmm", $TZ);
>+    	my $stop = parse_local_date("$day $stop_hhmm", $TZ);
>+    	t "checking if $start -> $stop goes backwards";
>+    	next if Date_Cmp($start, $stop) <= 0;
>+    	t "yup, it's a candidate";
>+
>+    	my $stop_next_day = parse_local_date("$next_day $stop_hhmm", $TZ);
>+    	die if Date_Cmp($stop_next_day, $start) <= 0;
>+    	t 'if it were, stop time on next day would be: ' . d $stop_next_day;
>+
>+    	my $distance = Delta_Format(DateCalc($start, $stop_next_day), 0, '%st');
>+    	t '...and length of programme: ' . d $distance;
>+    	t 'shortest length so far: ' . d $shortest;
>+    
>+    	if (not defined $shortest or $distance < $shortest) {
>+    	    t 'this is the best so far';
>+    	    $shortest = $distance;
>+    	    $crossing_at = $i;
>+    	}
>     }
>+
>     t '@todo=' . d \@todo;
> 
>     # Now given the place at which we cross from $day to $next_day we
>     # can add the appropriate days to the hh:mm times.
>     #
>     if (not defined $crossing_at) {
>-	push @$_, $day, $day foreach @todo;
>+    	push @$_, $day, $day foreach @todo;
>     }
>     else {
>-	for (my $i = 0; $i < $crossing_at; ++$i) {
>-	    push @{$todo[$i]}, $day, $day;
>-	}
>-	for (my $i = $crossing_at) {
>-	    push @{$todo[$i]}, $day, $next_day;
>-	}
>-	for (my $i = $crossing_at + 1; $i < @todo; ++$i) {
>-	    push @{$todo[$i]}, $next_day, $next_day;
>-	}
>+    	for (my $i = 0; $i < $crossing_at; ++$i) {
>+    	    push @{$todo[$i]}, $day, $day;
>+    	}
>+    	for (my $i = $crossing_at) {
>+    	    push @{$todo[$i]}, $day, $next_day;
>+    	}
>+    	for (my $i = $crossing_at + 1; $i < @todo; ++$i) {
>+    	    push @{$todo[$i]}, $next_day, $next_day;
>+    	}
>     }
> 
>     # Now we can parse the dates into Date::Manip objects.
>     my @r;
>     foreach (@todo) {
>-	my ($start_hhmm, $stop_hhmm, $title, $title_url, $start_day, $stop_day) = @$_;
>-	my ($start, $stop);
>-	if (defined $start_hhmm) {
>-	    $start = parse_local_date("$start_day $start_hhmm", $TZ);
>-	}
>-	if (defined $stop_hhmm) {
>-	    $stop = parse_local_date("$stop_day $stop_hhmm", $TZ);
>-	}
>-	push @r, [ $start, $stop, $title, $title_url ];
>+    	my ($start_hhmm, $stop_hhmm, $title, $title_url, $start_day, $stop_day) = @$_;
>+    	my ($start, $stop);
>+    	if (defined $start_hhmm) {
>+    	    $start = parse_local_date("$start_day $start_hhmm", $TZ);
>+	    }
>+    	if (defined $stop_hhmm) {
>+    	    $stop = parse_local_date("$stop_day $stop_hhmm", $TZ);
>+    	}
>+    	push @r, [ $start, $stop, $title, $title_url ];
>     }
>+
>     t 'after parsing dates: ' . d \@r;
> 
>     # Check the dates and weed out those which are obviously wrong.
>     my $last_start;
>     foreach (@r) {
>-	our ($start, $stop);
>-	local (*start, *stop) = \ ($_->[0], $_->[1]);
>-	t 'checking dates, $last_start=' . d $last_start;
>-	t '$start=' . d $start;
>-	t '$stop=' . d $stop;
>-	if (defined $start and defined $stop
>-	    and Date_Cmp($start, $stop) > 0) {
>-	    # Appears to stop before it starts.  Assume the stop time
>-	    # is bogus but the start time might be okay.
>-	    #
>-	    undef $stop;
>-	}
>-	if (defined $last_start) {
>-	    if (defined $start and Date_Cmp($start, $last_start) < 0) {
>-		# Appears to start before previous start.
>-		undef $start;
>-	    }
>+    	our ($start, $stop);
>+    	local (*start, *stop) = \ ($_->[0], $_->[1]);
>+    	t 'checking dates, $last_start=' . d $last_start;
>+    	t '$start=' . d $start;
>+    	t '$stop=' . d $stop;
>+
>+    	if (defined $start and defined $stop
>+	      and Date_Cmp($start, $stop) > 0) {
>+    	    # Appears to stop before it starts.  Assume the stop time
>+    	    # is bogus but the start time might be okay.
>+    	    #
>+    	    undef $stop;
>+    	}
>+    
>+    	if (defined $last_start) {
>+    	    if (defined $start and Date_Cmp($start, $last_start) < 0) {
>+    		# Appears to start before previous start.
>+    		undef $start;
>+        }
> 	    if (defined $stop and Date_Cmp($stop, $last_start) < 0) {
>-		# Stops before previous start - that's just as bad.
>-		undef $stop;
>+    		# Stops before previous start - that's just as bad.
>+    		undef $stop;
> 	    }
> 	}
> 	$last_start = $start if defined $start;
>@@ -1389,19 +1399,19 @@
> 
>     # If there is a 'next page' link do a recursive call to handle it.
>     foreach (@bits) {
>-	my ($type, $href, $text) = @$_;
>-	if ($type eq 'lijst_zender' and $text eq 'Volgende') {
>-	    # Next page for this day.
>-	    my $next_url = URI->new_abs($href, $url);
>-	    # Turn URL back into a string.
>-	    push @r, process_summary_page("$next_url", $day, $official_day, $ch_name_ref)
>-	}
>-	elsif ($type eq 'lijst_zender' and $text eq 'Vorige') {
>-	    # Previous page for this day, assume already fetched.
>-	}
>-	else {
>-	    warn "discarding leftover $type: $text";
>-	}
>+        my ($type, $href, $text) = @$_;
>+       	if ($type eq 'lijst_zender' and $text eq 'Volgende') {
>+       	    # Next page for this day.
>+            my $next_url = URI->new_abs($href, $url);
>+      	    # Turn URL back into a string.
>+       	    push @r, process_summary_page("$next_url", $day, $official_day, $ch_name_ref)
>+       	}
>+       	elsif ($type eq 'lijst_zender' and $text eq 'Vorige') {
>+       	    # Previous page for this day, assume already fetched.
>+       	}
>+       	else {
>+          	    warn "discarding leftover $type: $text";
>+       	}
>     }
> 
>     t 'returning tuples: ' . d \@r;
>@@ -1448,13 +1458,13 @@
> 	s/\bdecember\b/December/g;
> 	s/\bdec\b/December/g;
> 
>-	s/\bzondag\b/Sunday/g;
>-	s/\bmaandag\b/Monday/g;
>-	s/\bdinsdag\b/Tuesday/g;
>-	s/\bwoensdag\b/Wednesday/g;
>-	s/\bdonderdag\b/Thursday/g;
>-	s/\bvrijdag\b/Friday/g;
>-	s/\bzaterdag\b/Saturday/g;
>+	s/\bZondag\b/Sunday/g;
>+	s/\bMaandag\b/Monday/g;
>+	s/\bDinsdag\b/Tuesday/g;
>+	s/\bWoensdag\b/Wednesday/g;
>+	s/\bDonderdag\b/Thursday/g;
>+	s/\bVrijdag\b/Friday/g;
>+	s/\bZaterdag\b/Saturday/g;
> 
> 	my $r;
> 	eval { $r = parse_local_date($_, $TZ) };

Actions: View | Diff

Attachments on bug 94101: 59889