Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 94101
Collapse All | Expand All

(-)xmltv-0.5.39/grab/nl/tv_grab_nl.in (-693 / +703 lines)
Lines 53-60 Link Here
53
53
54
=head1 AUTHOR
54
=head1 AUTHOR
55
55
56
Guido Diepen and Ed Avis (ed@membled.com).  Originally based on
56
First version by Guido Diepen and Ed Avis (ed@membled.com).
57
tv_grab_fi by Matti Airas.
57
Second version by Eric Bus (eric@fambus.nl).
58
Originally based on tv_grab_fi by Matti Airas.
58
59
59
=head1 BUGS
60
=head1 BUGS
60
61
Lines 115-122 Link Here
115
}
116
}
116
117
117
# Function prototypes.
118
# Function prototypes.
118
sub time_tot_str( $ );
119
sub time_to_str( $ );
119
sub time_van_str( $ );
120
sub get_channels( $ );
120
sub get_channels( $ );
121
sub process_summary_page( $$$$ );
121
sub process_summary_page( $$$$ );
122
sub parse_dutch_date( $ );
122
sub parse_dutch_date( $ );
Lines 328-339 Link Here
328
foreach $ch_did (@channels) {
328
foreach $ch_did (@channels) {
329
    my $ch_xid = "$ch_did.tvgids.nl";
329
    my $ch_xid = "$ch_did.tvgids.nl";
330
    for (my $i = $opt_offset;$i<($opt_offset + $opt_days);$i++) {
330
    for (my $i = $opt_offset;$i<($opt_offset + $opt_days);$i++) {
331
	my $url = 'http://www.tvgids.nl/zoekprogramma.php'
331
    	my $url = 'http://www.tvgids.nl/zoeken/'
332
	  . "?station=$ch_did&interval=$i";
332
    	  . "?station=$ch_did&genre=&interval=$i&timeslot=0";
333
	my $day = UnixDate(DateCalc($now, "+ $i days"), '%Y-%m-%d');
333
     	my $day = UnixDate(DateCalc($now, "+ $i days"), '%Y-%m-%d');
334
	die if not defined $day;
334
    	die if not defined $day;
335
	die if ref $url;
335
    	die if ref $url;
336
	push @to_get, [ $url, $day, $ch_xid, $ch_did ];
336
    	push @to_get, [ $url, $day, $ch_xid, $ch_did ];
337
    }
337
    }
338
}
338
}
339
339
Lines 342-347 Link Here
342
my @summary_page_data;
342
my @summary_page_data;
343
my $bar = new XMLTV::ProgressBar('downloading summary', scalar @to_get)
343
my $bar = new XMLTV::ProgressBar('downloading summary', scalar @to_get)
344
  if not $opt_quiet;
344
  if not $opt_quiet;
345
345
foreach (@to_get) {
346
foreach (@to_get) {
346
    my ($url, $day, $ch_xmltv_id, $ch_tvgids_id) = @$_;
347
    my ($url, $day, $ch_xmltv_id, $ch_tvgids_id) = @$_;
347
    die if ref $url;
348
    die if ref $url;
Lines 354-359 Link Here
354
355
355
# Now we've fetched the descriptions, we know the channel names.
356
# Now we've fetched the descriptions, we know the channel names.
356
foreach $ch_did (@channels) {
357
foreach $ch_did (@channels) {
358
    print "processing channel #$ch_did\n";
357
    $ch_name = $channels{$ch_did};
359
    $ch_name = $channels{$ch_did};
358
    die "did not see any name for $ch_did in any listing pages"
360
    die "did not see any name for $ch_did in any listing pages"
359
      if not defined $ch_name;
361
      if not defined $ch_name;
Lines 364-369 Link Here
364
366
365
my @summary_programmes;
367
my @summary_programmes;
366
my %detail_url_to_summary_url;
368
my %detail_url_to_summary_url;
369
367
foreach (@summary_page_data) {
370
foreach (@summary_page_data) {
368
    my ($summary_url, $ch_xmltv_id, $data) = @$_;
371
    my ($summary_url, $ch_xmltv_id, $data) = @$_;
369
372
Lines 373-514 Link Here
373
    #
376
    #
374
    my $clump;			# [ start, stop, programmes ].  stop may be undef.
377
    my $clump;			# [ start, stop, programmes ].  stop may be undef.
375
    my @clumps;
378
    my @clumps;
379
376
    foreach (@$data) {
380
    foreach (@$data) {
377
	my ($start, $stop, $title, $url) = @$_;
381
    	my ($start, $stop, $title, $url) = @$_;
378
	die if ref $url;
382
    	die if ref $url;
379
	local $SIG{__WARN__} = sub { warn "$url: $_[0]" };
383
    	local $SIG{__WARN__} = sub { warn "$url: $_[0]" };
380
	for ($detail_url_to_summary_url{$url}) {
384
    	for ($detail_url_to_summary_url{$url}) {
381
	    warn "more than one programme with same details page $url"
385
    	    warn "more than one programme with same details page $url"
382
	      if defined;
386
    	      if defined;
383
	    $_ = $summary_url;
387
    	    $_ = $summary_url;
384
	}
388
    	}
385
	
389
	
386
	# Start and stop are common to a whole clump, but these
390
    	# Start and stop are common to a whole clump, but these
387
	# two are given individually for each programme.
391
    	# two are given individually for each programme.
388
	#
392
    	#
389
	my $details = [ $title, $url ];
393
    	my $details = [ $title, $url ];
390
	
394
	
391
	if (not defined $start) {
395
    	if (not defined $start) {
392
	    if (not $clump) {
396
    	    if (not $clump) {
393
		warn "programme '$title' at beginning of page has no start time, dropping\n";
397
        		warn "programme '$title' at beginning of page has no start time, dropping\n";
394
		next;
398
    	    	next;
395
	    }
399
    	    }
396
	    t 'found programme with no start time';
400
    	    t 'found programme with no start time';
397
	
401
	
398
	    if (defined $clump->[1]) {
402
    	    if (defined $clump->[1]) {
399
		t 'make it start at stop of last clump';
403
        		t 'make it start at stop of last clump';
400
		die if not defined $clump->[0];
404
        		die if not defined $clump->[0];
401
		push @clumps, $clump;
405
        		push @clumps, $clump;
402
		$clump = [ $clump->[1], $stop, [ $details ] ];
406
        		$clump = [ $clump->[1], $stop, [ $details ] ];
403
		die if not defined $clump->[0];
407
        		die if not defined $clump->[0];
404
	    }
408
    	    }
405
	    else {
409
    	    else {
406
		t 'current clump has no stop, add to clump';
410
        		t 'current clump has no stop, add to clump';
407
		push @{$clump->[2]}, $details;
411
        		push @{$clump->[2]}, $details;
408
		t 'maybe set stop of current clump';
412
        		t 'maybe set stop of current clump';
409
		$clump->[1] = $stop;
413
        		$clump->[1] = $stop;
410
	    }
414
    	    }
411
	}
415
    	}
412
	else {
416
    	else {
413
	    t 'programme has start time, make new clump';
417
    	    t 'programme has start time, make new clump';
414
	    if ($clump) {
418
    	    if ($clump) {
415
		die if not defined $clump->[0];
419
        		die if not defined $clump->[0];
416
		my $cmp = Date_Cmp($clump->[0], $start);
420
        		my $cmp = Date_Cmp($clump->[0], $start);
417
		if ($cmp == 0) {
421
        
418
		    # Oddity in the web pages: this programme has the
422
        		if ($cmp == 0) {
419
		    # same start time as a previous one.  (Until I
423
        		    # Oddity in the web pages: this programme has the
420
		    # found this case, the clumping was only for cases
424
        		    # same start time as a previous one.  (Until I
421
		    # where a programme lacked a start time, I think.)
425
        		    # found this case, the clumping was only for cases
422
		    # Anyway handle this by making it join the
426
        		    # where a programme lacked a start time, I think.)
423
		    # existing clump.
427
        		    # Anyway handle this by making it join the
424
		    #
428
        		    # existing clump.
425
		    t 'same start time as existing clump, join it';
429
        		    #
426
430
        		    t 'same start time as existing clump, join it';
427
		    if (defined $stop) {
431
428
			# Compare stop time of the current clump with
432
        		    if (defined $stop) {
429
			# this programme and extend the clump if
433
            			# Compare stop time of the current clump with
430
			# necessary.
434
            			# this programme and extend the clump if
431
			#
435
            			# necessary.
432
			my $cmp = Date_Cmp($clump->[1], $stop);
436
            			#
433
			if ($cmp < 0) {
437
            			my $cmp = Date_Cmp($clump->[1], $stop);
434
			    # Okay, later stop time, extend clump.
438
            			if ($cmp < 0) {
435
			    $clump->[1] = $stop;
439
            			    # Okay, later stop time, extend clump.
436
			}
440
            			    $clump->[1] = $stop;
437
			elsif ($cmp == 0) {
441
            			}
438
			    # Okay.
442
            			elsif ($cmp == 0) {
439
			}
443
            			    # Okay.
440
			elsif ($cmp > 0) {
444
            			}
441
			    warn "programme (from $start to $stop) has same start as one before it, but earlier stop, ignoring stop time\n";
445
            			elsif ($cmp > 0) {
442
			}
446
    	    	    	    warn "programme (from $start to $stop) has same start as one before it, but earlier stop, ignoring stop time\n";
443
			else { die }
447
        	    		}
444
		    }
448
            			else { die }
445
449
        		    }
446
		    push @{$clump->[2]}, $details;
450
447
		}
451
        		    push @{$clump->[2]}, $details;
448
		else {
452
        		}
449
		    # (Don't bother checking that $cmp < 0, it should
453
        		else {
450
		    # be, but often programmes appear out of order...)
454
        		    # (Don't bother checking that $cmp < 0, it should
451
		    #
455
    	    	    # be, but often programmes appear out of order...)
452
		    push @clumps, $clump;
456
        		    #
453
		    $clump = [ $start, $stop, [ $details ] ];
457
        		    push @clumps, $clump;
454
		    t 'started a new clump for new start time, now: ' . d $clump;
458
        		    $clump = [ $start, $stop, [ $details ] ];
455
		}
459
        		    t 'started a new clump for new start time, now: ' . d $clump;
456
	    }
460
        		}
457
	    else {
461
    	    }
458
		t 'no existing clump, starting one for this programme';
462
    	    else {
459
		$clump = [ $start, $stop, [ $details ] ];
463
        		t 'no existing clump, starting one for this programme';
460
	    }
464
        		$clump = [ $start, $stop, [ $details ] ];
461
	    die if not defined $clump->[0];
465
    	    }
462
	}
466
    
467
            die if not defined $clump->[0];
468
        }
463
    }
469
    }
470
464
    if ($clump) {
471
    if ($clump) {
465
	die if not defined $clump->[0];
472
    	die if not defined $clump->[0];
466
	push @clumps, $clump;
473
    	push @clumps, $clump;
467
    }
474
    }
475
468
    t '\@clumps=' . d \@clumps;
476
    t '\@clumps=' . d \@clumps;
469
    
477
    
470
    # Now add the clumpidx attributes.
478
   # Now add the clumpidx attributes.
471
    foreach (@clumps) {
479
   foreach (@clumps) {
472
	my ($start, $stop, $l) = @$_;
480
    	my ($start, $stop, $l) = @$_;
473
	die if not defined $start;
481
    	die if not defined $start;
474
	my $num_in_clump = @$l;
482
    	my $num_in_clump = @$l;
475
	if ($num_in_clump == 1) {
483
    	if ($num_in_clump == 1) {
476
	    # Common case, no clumpidx needed.
484
    	    # Common case, no clumpidx needed.
477
	}
485
    	}
478
	elsif ($num_in_clump > 1) {
486
    	elsif ($num_in_clump > 1) {
479
	    foreach my $i (0 .. $num_in_clump - 1) {
487
    	    foreach my $i (0 .. $num_in_clump - 1) {
480
		# Add clumpidx as last thing in list.
488
        		# Add clumpidx as last thing in list.
481
		push @{$l->[$i]}, "$i/$num_in_clump";
489
        		push @{$l->[$i]}, "$i/$num_in_clump";
482
	    }
490
    	    }
483
	}
491
    	}
484
	else { die }
492
    	else { die }
485
    }
493
    }
494
486
    t 'after adding clumpidxes, \@clumps=' . d \@clumps;
495
    t 'after adding clumpidxes, \@clumps=' . d \@clumps;
487
    
496
    
488
    # Finally turn the data into programmes.
497
    # Finally turn the data into programmes.
489
    foreach (@clumps) {
498
    foreach (@clumps) {
490
	my ($start, $stop, $l) = @$_;
499
    	my ($start, $stop, $l) = @$_;
491
	die if not defined $start;
500
    	die if not defined $start;
492
	foreach (@$l) {
501
    	foreach (@$l) {
493
	    my ($title, $url, $clumpidx) = @$_;
502
    	    my ($title, $url, $clumpidx) = @$_;
494
	    my %h = (channel => $ch_xmltv_id,
503
    	    my %h = (channel => $ch_xmltv_id,
495
		     title => [ [ $title, $LANG ] ],
504
    		     title => [ [ $title, $LANG ] ],
496
		    );
505
        );
497
	    for (date_to_local($start, $TZ)) {
506
	
498
		$h{start} = UnixDate($_->[0], '%q') . " $_->[1]";
507
        for (date_to_local($start, $TZ)) {
508
    		$h{start} = UnixDate($_->[0], '%q') . " $_->[1]";
499
	    }
509
	    }
510
500
	    if (defined $stop) {
511
	    if (defined $stop) {
501
		for (date_to_local($stop, $TZ)) {
512
    		for (date_to_local($stop, $TZ)) {
502
		    $h{stop} = UnixDate($_->[0], '%q') . " $_->[1]";
513
    		    $h{stop} = UnixDate($_->[0], '%q') . " $_->[1]";
503
		}
514
    		}
504
	    }
515
	    }
505
	    
516
	    
506
	    if (defined $url) {
517
	    if (defined $url) {
507
		die if ref $url;
518
    		die if ref $url;
508
		$h{url} = [ $url ];
519
    		$h{url} = [ $url ];
509
	    }
520
	    }
510
	    $h{clumpidx} = $clumpidx if defined $clumpidx;
521
511
	    push @summary_programmes, \%h;
522
        $h{clumpidx} = $clumpidx if defined $clumpidx;
523
        push @summary_programmes, \%h;
512
	}
524
	}
513
    }
525
    }
514
}
526
}
Lines 573-605 Link Here
573
	    my $new = $detailed->{$_};
585
	    my $new = $detailed->{$_};
574
586
575
	    if ($_ eq 'title') {
587
	    if ($_ eq 'title') {
576
		# We know how to merge this.  TODO write general
588
    		# We know how to merge this.  TODO write general
577
		# XMLTV::Merge. 
589
    		# XMLTV::Merge. 
578
		#
590
    		#
579
		my %already;
591
    		my %already;
580
		foreach my $a (@$new) {
592
    		foreach my $a (@$new) {
581
		    my $d = Dumper($a);
593
    		    my $d = Dumper($a);
582
		    $already{$d}++ && warn "duplicate $_: $d";
594
    		    $already{$d}++ && warn "duplicate $_: $d";
583
		}
595
    		}
584
		foreach my $o (@$old) {
596
    		foreach my $o (@$old) {
585
		    my $d = Dumper($o);
597
    		    my $d = Dumper($o);
586
		    push @$new, $o unless $already{$d};
598
    		    push @$new, $o unless $already{$d};
587
		}
599
    		}
588
	    }
600
	    }
589
	    else {
601
	    else {
590
		# Compare the two data structures.  For this to work
602
    		# Compare the two data structures.  For this to work
591
		# correctly it requires Data::Dumper 2.12 or later, as
603
    		# correctly it requires Data::Dumper 2.12 or later, as
592
		# shipped with perl 5.8.0.  Older versions don't
604
    		# shipped with perl 5.8.0.  Older versions don't
593
		# support $Sortkeys.  But we don't have any version
605
    		# support $Sortkeys.  But we don't have any version
594
		# check here - in the worst case all that results from
606
    		# check here - in the worst case all that results from
595
		# using an older Data::Dumper is a few spurious
607
    		# using an older Data::Dumper is a few spurious
596
		# warning messages.
608
    		# warning messages.
597
		#
609
    		#
598
		my $old_dump = Dumper($old);
610
    		my $old_dump = Dumper($old);
599
		my $new_dump = Dumper($new);
611
    		my $new_dump = Dumper($new);
600
		if ($old_dump ne $new_dump) {
612
    		if ($old_dump ne $new_dump) {
601
		    warn "mismatch between summary page and details page $url for $_: $old_dump vs $new_dump\n";
613
    		    warn "mismatch between summary page and details page $url for $_: $old_dump vs $new_dump\n";
602
		}
614
    		}
603
	    }
615
	    }
604
	}
616
	}
605
617
Lines 629-648 Link Here
629
my $warned_bad_chars;
641
my $warned_bad_chars;
630
sub tidy( $ ) {
642
sub tidy( $ ) {
631
    for (my $tmp = shift) {
643
    for (my $tmp = shift) {
632
	tr/\221\222/''/;
644
    	tr/\221\222/''/;
633
	if (tr/\012\015\040-\176\240-\377//dc) {
645
    	if (tr/\012\015\040-\176\240-\377//dc) {
634
	    warn 'removing bad characters' unless $warned_bad_chars++;
646
    	    warn 'removing bad characters' unless $warned_bad_chars++;
635
	}
647
    	}
636
	return $_;
648
    	return $_;
637
    }
649
    }
638
}
650
}
639
651
640
# Returns a programme hashref, or undef, or the magic 'END'.
652
# Returns a programme hashref, or undef, or the magic 'END'.
641
sub process_details_page( $$$ ) {
653
sub process_details_page( $$$ ) {
642
    foreach (@_) { die if ref }
654
    foreach (@_) { die if ref }
655
643
    my ($ch_xmltv_id, $url, $master_url) = @_;
656
    my ($ch_xmltv_id, $url, $master_url) = @_;
657
644
    local $SIG{__WARN__} = sub {
658
    local $SIG{__WARN__} = sub {
645
	warn "$url (from $master_url): $_[0]";
659
    	warn "$url (from $master_url): $_[0]";
646
    };
660
    };
647
661
648
    # We make an HTML::TreeBuilder object, get the information
662
    # We make an HTML::TreeBuilder object, get the information
Lines 650-947 Link Here
650
    #
664
    #
651
    my $t = new HTML::TreeBuilder();
665
    my $t = new HTML::TreeBuilder();
652
    eval {
666
    eval {
653
	$t->parse(tidy(get_nice($url)));
667
    	$t->parse(tidy(get_nice($url)));
654
    };
668
    };
669
655
    if ($@) {
670
    if ($@) {
656
	warn "error getting/parsing $url: $@";
671
    	warn "error getting/parsing $url: $@";
657
	return;
672
    	return;
658
    }
673
    }
659
    my @elems = $t->look_down(class => 'detailDeel');
674
660
    if (not @elems) {
675
    my @elems = $t->look_down('id' => 'progDetail');
661
	warn "did not see any 'detailDeel' elements, skipping page";
676
    if (not @elems or @elems != 1) {
662
	return;
677
    	warn "did not see one single 'progDetail' element, skipping page";
678
    	return;
663
    }
679
    }
680
    
681
    my $elem = $elems[0];
682
    my @hs = $elem->look_down('_tag' => 'h3');    
683
    if (not @hs or @hs != 2) {
684
        warn "could not find a valid 'h3' title, skipping page";
685
        return;
686
    }    
687
    
688
    my $naam = $hs[0]->as_text();
689
	$naam =~ s/^\s+//; $naam =~ s/\s+$//;
690
    
691
    my @desc;                            # accumulate bits
692
    my @ps = $elem->look_down('_tag' => 'p');
693
    if (not @ps) {
694
        warn "could not find a valid description, skipping page";
695
        return;
696
    }
697
   
698
    for( my $i=0; $i<=2; $i++ )
699
    {
700
        my $text = $ps[$i]->as_text();
701
        $text =~ s/^\s+//;
702
        $text =~ s/\s+$//;
703
        push @desc, $text if $text ne '';
704
    }
705
    
706
    @elems = $t->look_down('id' => 'data');
707
    if (not @elems or @elems != 1) {
708
    	warn "did not see one single 'data' element, skipping page";
709
    	return;
710
    }
711
    
664
    my @info;
712
    my @info;
665
    foreach (@elems) {
713
    $elem = $elems[0];
666
	my @cont = grep { ref } $_->content_list();
714
    my @trs = $elem->look_down('_tag' => 'tr');
667
	my $n = scalar @cont;
715
    if( not @trs ) {
668
	if ($n != 2) {
716
        warn "did not find records below the 'data' element, skipping page";
669
	    warn "'detailDeel' has $n elements instead of 2";
717
        return;
670
	    next;
718
    }
671
	}
719
    
672
	my ($k, $v) = @cont;
720
    foreach (@trs) {
673
	for ($k->attr('class')) {
721
        my $tr = $_;
674
	    if (not defined or $_ ne 'detailLabel2') {
722
        my @ths = $tr->look_down('_tag' => 'th');
675
		warn "didn't see 'detailLabel2' in 'detailDeel'";
723
        my @tds = $tr->look_down('_tag' => 'td');
676
		next;
724
        if( @ths == 1 && @tds == 1 ) {
677
	    }
725
        	push @info, [ $ths[0]->as_text(), $tds[0]->as_text() ];
678
	}
726
        }
679
	for ($v->attr('class')) {
680
	    if (not defined or $_ ne 'detailContent2') {
681
		warn "didn't see 'detailContent2' in 'detailDeel'";
682
		next;
683
	    }
684
	}
685
	push @info, [ $k->as_text(), $v->as_text() ];
686
    }
727
    }
687
    $t->delete(); undef $t;
728
    $t->delete(); undef $t;
688
729
689
    # Process the list of [ heading, data ] pairs.
730
    # Process the list of [ heading, data ] pairs.
690
    my (
731
    my (
691
	# Exactly one:
732
    	# Exactly one:
692
	$van, $tot, $naam,
733
    	$van, $tot,
693
734
694
	# At most one:
735
    	# At most one:
695
        $director, $previously_shown, $orig_title, $sub_title, $genre,
736
        $director, $previously_shown, $orig_title, $sub_title, $genre,
696
        $date, $episode_num, $actors, $writers, $commentators,
737
        $date, $episode_num, $actors, $writers, $commentators,
697
738
698
	# Zero or more:
739
    	# Zero or more:
699
        @presenter, @url,
740
        @presenter, @url,
700
       );
741
    );
742
701
    # NB 'at most one' $actors but that one entry can give several.
743
    # NB 'at most one' $actors but that one entry can give several.
702
744
703
    my ($teletext_sub, $widescreen) = 0; # boolean
745
    my ($teletext_sub, $widescreen) = 0; # boolean
704
    my @desc;                            # accumulate bits
705
    my $seen_tijdstip = 0;
746
    my $seen_tijdstip = 0;
706
    my $last;
747
    my $last;
707
  ELEM: foreach (@info) {
708
	my ($regel, $text) = @$_;
709
	foreach ($regel, $text) {
710
	    s/^\s+//; s/\s+$//;
711
	}
712
713
	if ($regel eq '') {
714
	    # Continuation of the previous one, hopefully.
715
	    $regel = $last;
716
	}
717
	else {
718
	    # They usually end with a colon but not always.
719
	    $regel =~ s/:$//;
720
	    $last = $regel;
721
	}
722
748
723
	if ($regel eq 'Tijdstip') {
749
    ELEM: foreach (@info) {
724
	    warn "seen 'Tijdstip' twice\n" if $seen_tijdstip++;
750
        my ($regel, $text) = @$_;
725
	    if (length($text)<=16) {
751
        foreach ($regel, $text) {
726
		t "'onvolledig' is true, nothing more to write";
752
    	    s/^\s+//; s/\s+$//;
727
		t 'process_details_page() RETURNING';
753
    	}
728
		return 'END';
754
729
	    }
755
    	if ($regel eq '') {
730
	    else {
756
    	    # Continuation of the previous one, hopefully.
731
		# Extract time strings from the text, but not full
757
    	    $regel = $last;
732
		# Date::Manip objects.
758
    	}
733
		#
759
    	else {
734
		$van = time_van_str($text);
760
    	    # They usually end with a colon but not always.
735
		$tot = time_tot_str($text);
761
    	    $regel =~ s/:$//;
736
	    }
762
    	    $last = $regel;
737
	}
763
    	}
738
	elsif ($regel eq 'Inhoud') {
764
739
	    # Empty text for this happens often, just skip it.
765
    	if ($regel eq 'Datum en tijdstip') {
740
	    push @desc, $text if $text ne '';
766
    	    warn "seen 'Tijdstip' twice\n" if $seen_tijdstip++;
741
	}
767
    	    if (length($text)<=16) {
742
	elsif ($regel eq 'Programma') {
768
        		t "'onvolledig' is true, nothing more to write";
743
	    warn "seen 'Programma' twice\n" if defined $naam;
769
        		t 'process_details_page() RETURNING';
744
	    # FIXME should really look for 'herhaling' in italics.
770
        		return 'END';
745
	    if ($text =~ s/\bherhaling\s+van\s+(\d\d?)-(\d\d?)-(\d{4})//) {
771
    	    }
746
		warn "seen previously-shown information twice\n"
772
            else {
747
		  if $previously_shown;
773
        		# Extract time strings from the text, but not full
748
		my ($dd, $mm, $yyyy) = ($1, $2, $3);
774
        		# Date::Manip objects.
749
		$previously_shown = { start => "$yyyy$mm$dd" };
775
        		#
750
		$text =~ s/^\s+//; $text =~ s/\s+$//;
776
        		($van,$tot) = time_to_str($text);
751
	    }
777
	        }
752
	    elsif ($text =~ s/herhaling\b//) {
778
    	}
753
		# Repeat, but no previous date given.  NB
779
    	elsif ($regel eq 'Inhoud') {
754
		# sometimes we see 'herhaling' without a space
780
    	    # Empty text for this happens often, just skip it.
755
		# before it, as in the redundant
781
    	    push @desc, $text if $text ne '';
756
		#
782
    	}
757
		# 'Netwerk herhalingenherhaling'
783
    	elsif ($regel eq 'Genre') {
758
		#
784
    	    warn "seen 'Genre' twice\n" if defined $genre;
759
		# Hence no \b at the start of the regexp.  We just
785
    	    # Empty text for this happens often, just skip it.
760
		# have to hope there aren't too many compound
786
    	    $genre = $text if $text ne '';
761
		# words ending in 'herhaling'.
787
    	}
762
		#
788
    	elsif ($regel eq 'Zender') {
763
		$previously_shown = {};
789
            # In the new layout, this field contains the logo
764
		$text =~ s/^\s+//; $text =~ s/\s+$//;
790
            # for the channel. Ignore it, because these logos
765
	    }
791
            # are too small to use in an application.
766
	    $naam = $text;
792
    	    #
767
	}
793
    	}
768
	elsif ($regel eq 'Genre') {
794
    	elsif ($regel eq 'Omroep') {
769
	    warn "seen 'Genre' twice\n" if defined $genre;
795
            # We ignore this setting, because the XMLTV format
770
	    # Empty text for this happens often, just skip it.
796
            # doesn't have room for this. In Holland, a few broadcasters
771
	    $genre = $text if $text ne '';
797
            # share the same channel.       
772
	}
798
    	    #
773
	elsif ($regel eq 'Zender') {
799
    	}
774
	    # I think this means 'broadcaster' but the information
800
    	elsif ($regel eq 'Bijzonderheden') {
775
	    # is redundant because we already know the channel.
801
    	    foreach (split /,\s*/, $text) {
776
	    #
802
        		if ($_ eq 'Teletekst ondertiteld') {
777
	    # Then we should check it and warn if it differs!  But
803
        		    # I'm guessing this means teletext subtitles :-).
778
	    # it does differ - every programme on the channel
804
        		    $teletext_sub++
779
	    # Nederland 1, it seems, has Zender of 'Nederland
805
        		      && warn 'seen teletext subtitles twice';
780
	    # 2'. So we just ignore this information.
806
        		}
781
	    #
807
        		elsif ($_ eq 'Breedbeeld uitzending') {
782
	}
808
        		    $widescreen++ && warn 'seen widescreen twice';
783
	elsif ($regel eq 'Omroep') {
809
        		}
784
	    # FIXME I don't know what this means (the dictionary
810
        		elsif (length >= 50) {
785
	    # says 'wireless telegraph' but that's no help) so
811
        		    # Some long sentence, part of description.
786
	    # just ignore it.
812
        		    push @desc, $_;
787
	    #
813
        		}
788
	}
814
        		else {
789
	elsif ($regel eq 'Kenmerken') {
815
        		    warn "unknown 'Bijzonderheden' bit $_"
790
	    foreach (split /,\s*/, $text) {
816
        		      unless $warned_regel{"Bijzonderheden: $_"}++;	
791
		if ($_ eq 'Teletekst ondertiteld') {
817
        		    push @desc, $_;
792
		    # I'm guessing this means teletext subtitles :-).
818
        		}
793
		    $teletext_sub++
819
    	    }
794
		      && warn 'seen teletext subtitles twice';
820
    	}
795
		}
821
        elsif ($regel eq 'Teletekst') {
796
		elsif ($_ eq 'Breedbeeld uitzending') {
822
            # Teletekst contains the 'teletext' page for this programme
797
		    $widescreen++ && warn 'seen widescreen twice';
823
            # This information isn't used at the moment
798
		}
824
        }
799
		elsif (length >= 50) {
825
    	elsif ($regel eq 'Presentatie') {
800
		    # Some long sentence, part of description.
826
    	    push @presenter, $text;
801
		    push @desc, $_;
827
    	}
802
		}
828
    	elsif ($regel eq 'Aflevering') {
803
		else {
829
    	    warn "seen 'Aflevering"
804
		    warn "unknown 'Kenmerken' bit $_"
830
    	      if defined $episode_num;
805
		      unless $warned_regel{"Kenmerken: $_"}++;	
831
    	    if ($text eq 'Slot') {
806
		    push @desc, $_;
832
        		# The last episode of a series.  There isn't a way to
807
		}
833
        		# store this in the current XMLTV format.
808
	    }
834
        		#
809
	}
835
        		warn "discarding 'Slot'" unless $warned_slot++;
810
	elsif ($regel eq 'Presentatie') {
836
    	    }
811
	    push @presenter, $text;
837
    	    elsif ($text =~ /^\d+$/) {
812
	}
838
        		if ($text == 0) {
813
	elsif ($regel eq 'Afleverings nummer') {
839
        		    warn "I thought episode nums on the site were from 1";
814
	    warn "seen 'Afleverings nummer' twice"
840
        		}
815
	      if defined $episode_num;
841
    	    	else {
816
	    if ($text eq 'Slot') {
842
        		    $episode_num = $text - 1;
817
		# The last episode of a series.  There isn't a way to
843
        		}
818
		# store this in the current XMLTV format.
844
    	    }
819
		#
845
    	    elsif ($text =~ /^(?:\d+-)+\d$/) {
820
		warn "discarding 'Slot'" unless $warned_slot++;
846
    	    	# This means multiple episodes.  This ought to be
821
	    }
847
        		# handled by turning the programme into a clump.
822
	    elsif ($text =~ /^\d+$/) {
848
        		#
823
		if ($text == 0) {
849
        		warn "programme covers multiple episodes ($text), not handled";
824
		    warn "I thought episode nums on the site were from 1";
850
    	    }
825
		}
851
    	    else {
826
		else {
852
        		warn "bad episode number $text";
827
		    $episode_num = $text - 1;
853
    	    }
828
		}
854
    	}
829
	    }
855
    	elsif ($regel eq 'Titel aflevering') {
830
	    elsif ($text =~ /^(?:\d+-)+\d$/) {
856
    	    warn "seen 'Titel aflevering' twice"
831
		# This means multiple episodes.  This ought to be
857
    	      if defined $sub_title;
832
		# handled by turning the programme into a clump.
858
    	    $sub_title = $text;
833
		#
859
    	}
834
		warn "programme covers multiple episodes ($text), not handled";
860
    	elsif ($regel eq 'Url') {
835
	    }
861
    	    # We have to turn the string given, which is normally
836
	    else {
862
    	    # just a hostname, into a URL.  I don't see why they
837
		warn "bad episode number $text";
863
    	    # don't just link to it directly, this is a web site
838
	    }
864
    	    # after all.
839
	}
865
    	    #
840
	elsif ($regel eq 'Titel aflevering') {
866
    	    # Anyway, the URI library doesn't seem to have any way
841
	    warn "seen 'Titel aflevering' twice"
867
    	    # to take a string and turn it into a URL adding
842
	      if defined $sub_title;
868
    	    # 'http:' if necessary, so we do this by hand.
843
	    $sub_title = $text;
869
    	    #
844
	}
870
    	    if ($text !~ tr/://) {
845
	elsif ($regel eq 'Webpagina') {
871
        		$text = "http://$text";
846
	    # We have to turn the string given, which is normally
872
    	    }
847
	    # just a hostname, into a URL.  I don't see why they
873
    	    push @url, $text;
848
	    # don't just link to it directly, this is a web site
874
    	}
849
	    # after all.
875
    	elsif ($regel eq 'Acteurs') {
850
	    #
876
    	    warn "seen 'Rolverdeling' twice" if $actors;
851
	    # Anyway, the URI library doesn't seem to have any way
877
    
852
	    # to take a string and turn it into a URL adding
878
    	    # 'e.a' appearing in the description means 'and others';
853
	    # 'http:' if necessary, so we do this by hand.
879
    	    # it's implicit in XMLTV that there might be other actors,
854
	    #
880
    	    # so we quietly remove it.
855
	    if ($text !~ tr/://) {
881
    	    #
856
		$text = "http://$text";
882
    	    $text =~ s/\s*e\.a\s*$//;
857
	    }
883
    	
858
	    push @url, $text;
884
    	    while (length $text) {
859
	}
885
        		if ($text =~ s/\s*([^:]+):\s*([^.]+)(?:$|\.)//) {
860
	elsif ($regel eq 'Rolverdeling') {
886
        		    warn "discarding information about the parts played by each actor\n"
861
	    warn "seen 'Rolverdeling' twice" if $actors;
887
        		      unless $warned_discarding_parts++;
862
888
        		    push @$actors, $2;
863
	    # 'e.a' appearing in the description means 'and others';
889
        		}
864
	    # it's implicit in XMLTV that there might be other actors,
890
        		elsif ($text =~ s/\s*([^,]+)(?:$|,)//) {
865
	    # so we quietly remove it.
891
        		    push @$actors, $1;
866
	    #
892
        		}
867
	    $text =~ s/\s*e\.a\s*$//;
893
        		else {
868
	
894
        		    warn "unknown remnant 'Rolverdeling' text '$text'";
869
	    while (length $text) {
895
        		    last;
870
		if ($text =~ s/\s*([^:]+):\s*([^.]+)(?:$|\.)//) {
896
        		}
871
		    warn "discarding information about the parts played by each actor\n"
897
    	    }
872
		      unless $warned_discarding_parts++;
898
    	}
873
		    push @$actors, $2;
899
    	elsif ($regel eq 'Scenario') {
874
		}
900
    	    warn "seen 'Scenario' twice" if $writers;
875
		elsif ($text =~ s/\s*([^,]+)(?:$|,)//) {
901
    	    push @$writers, $text;
876
		    push @$actors, $1;
902
    	}
877
		}
903
    	elsif ($regel eq 'Email') {
878
		else {
904
    	    push @url, "mailto:$text";
879
		    warn "unknown remnant 'Rolverdeling' text '$text'";
905
    	}
880
		    last;
906
    	elsif ($regel eq 'Bron') {
881
		}
907
    	    # FIXME cannot do anything special with this.  It
882
	    }
908
    	    # means 'source' and perhaps by parsing the text we
883
	}
909
    	    # could find the names of writers or whatever.
884
	elsif ($regel eq 'Scenario schrijver') {
910
    	    #
885
	    warn "seen 'Scenario schrijver' twice" if $writers;
911
    	    # push @desc, "$regel: $text";
886
	    push @$writers, $text;
912
    	}
887
	}
913
    	elsif ($regel eq 'Commentaar') {
888
	elsif ($regel eq 'E-mail') {
914
    	    push @$commentators, $text;
889
	    push @url, "mailto:$text";
915
    	}
890
	}
916
    	elsif ($regel eq 'Jaar van premiere') {
891
	elsif ($regel eq 'Bron') {
917
    	    # Year of release, I think.
892
	    # FIXME cannot do anything special with this.  It
918
    	    warn "seen 'Jaar van premiere' twice"
893
	    # means 'source' and perhaps by parsing the text we
919
    	      if defined $date;
894
	    # could find the names of writers or whatever.
920
    	    $date = $text;
895
	    #
921
    	}
896
	    push @desc, "$regel: $text";
922
    	elsif ($regel eq 'Regisseur') {
897
	}
923
    	    warn "seen 'Regisseur' twice" if defined $director;
898
	elsif ($regel eq 'Commentaar') {
924
    	    $director = $text;
899
	    push @$commentators, $text;
925
    	}
900
	}
926
    	elsif ($regel eq 'Orginele titel') {
901
	elsif ($regel eq 'Jaar van premiere') {
927
    	    warn "seen 'Orginele titel' twice" if defined $orig_title;
902
	    # Year of release, I think.
928
    	    $orig_title = $text;
903
	    warn "seen 'Jaar van premiere' twice"
929
    	}
904
	      if defined $date;
930
    	elsif ($regel eq 'Behaalde prijzen') {
905
	    $date = $text;
931
    	    # Awards won.  It doesn't seem worth adding a separate
906
	}
932
    	    # field for this to the XMLTV format, just append to
907
	elsif ($regel eq 'Regisseur') {
933
    	    # the description.
908
	    warn "seen 'Regisseur' twice" if defined $director;
934
    	    #
909
	    $director = $text;
935
    	    # push @desc, "$regel: $text";
910
	}
936
    	}
911
	elsif ($regel eq 'Orginele titel') {
937
    	elsif ($regel eq 'Website') {
912
	    warn "seen 'Orginele titel' twice" if defined $orig_title;
938
    	    push @url, ($text =~ /^[a-z]+:/) ? $text : "http://$text";
913
	    $orig_title = $text;
939
    	}
914
	}
940
    	else {
915
	elsif ($regel eq 'Behaalde prijzen') {
941
    	    # Unknown key, report it back to the prompt
916
	    # Awards won.  It doesn't seem worth adding a separate
942
    	    #
917
	    # field for this to the XMLTV format, just append to
943
    	    warn "unknown programme info key $regel\n"
918
	    # the description.
944
    	      unless $warned_regel{$regel}++;
919
	    #
945
    	}
920
	    push @desc, "$regel: $text";
921
	}
922
	elsif ($regel eq 'Website') {
923
	    push @url, ($text =~ /^[a-z]+:/) ? $text : "http://$text";
924
	}
925
	else {
926
	    # Unknown key, but let's add it to the desc so we
927
	    # don't lose information.  These newlines are just for
928
	    # the benefit of someone reading the XML by hand.
929
	    #
930
	    push @desc, "$regel: $text";
931
	    warn "unknown programme info key $regel\n"
932
	      unless $warned_regel{$regel}++;
933
	}
934
    }
946
    }
935
947
936
    if (not defined $naam) {
948
    if (not defined $naam) {
937
	warn "did not see programme title, skipping programme\n";
949
    	warn "did not see programme title, skipping programme\n";
938
	return;
950
    	return;
939
    }
951
    }
940
    if (not defined $van) {
952
    if (not defined $van) {
941
	warn "did not see programme times, skipping programme\n";
953
       	warn "did not find programme starttime, skipping programme\n";
942
	return;
954
    	return;
955
    }
956
    if (not defined $tot) {
957
       	warn "did not see programme endtime, skipping programme\n";
958
    	return;
943
    }
959
    }
944
    die if not defined $tot;
945
960
946
    my @title = ([ $naam, $LANG]);
961
    my @title = ([ $naam, $LANG]);
947
    push @title, [ $orig_title ] if defined $orig_title; # not Dutch!
962
    push @title, [ $orig_title ] if defined $orig_title; # not Dutch!
Lines 983-1023 Link Here
983
    return \%prog;
998
    return \%prog;
984
}
999
}
985
1000
986
sub time_tot_str( $ ) {
1001
sub time_to_str( $ ) {
987
    my $input = shift;
1002
    my $input = shift;
988
    if (length($input) == 15) {
989
	$input .= '05:00';
990
    }
991
    my $datum = substr($input,0,length($input)-11);
992
    my $tot   = substr($input,-5);
993
994
    $datum =~ /(\d\d?)-(\d\d?)-(\d+)/
995
      or die "cannot find year in '$datum'";
996
    my ($dd, $mm, $yyyy) = ($1, $2, $3);
997
    foreach ($dd, $mm) { $_ = "0$_" if length == 1 }
998
    $tot =~ /(\d\d):(\d\d)/
999
      or die "cannot find time in '$tot'";
1000
    my ($HH, $MM) = ($1, $2);
1001
1003
1002
    return "$yyyy-$mm-$dd $HH:$MM:00";
1004
    # Replace months
1003
}
1005
	$input =~ s/\bjanuari\b/1/g;
1004
sub time_van_str( $ ) {
1006
	$input =~ s/\bfebruari\b/2/g;
1005
    my $input = shift;
1007
	$input =~ s/\bmaart\b/3/g;
1006
    if (length($input) == 15) {
1008
	$input =~ s/\bapril\b/4/g;
1007
	$input .= '06:00';
1009
	$input =~ s/\bmei\b/5/g;
1010
	$input =~ s/\bjuni\b/6/g;
1011
	$input =~ s/\bjuli\b/7/g;
1012
	$input =~ s/\baugustus\b/8/g;
1013
	$input =~ s/\bseptember\b/9/g;
1014
	$input =~ s/\boktober\b/10/g;
1015
	$input =~ s/\bnovember\b/11/g;
1016
	$input =~ s/\bdecember\b/12/g;
1017
    
1018
    if( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), (\d\d:\d\d)-(\d\d:\d\d) uur/ ) {
1019
    	return ( "$3-$2-$1 $4:00", "$3-$2-$1 $5:00" );
1020
    }
1021
    elsif( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), -(\d\d):(\d\d) uur/ ) {
1022
    	return ( "$4-$3-$2 $4:".($5-30).":00", "$3-$2-$1 $4:$5:00" );
1023
    }
1024
    elsif( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), (\d\d):(\d\d)- uur/ ) {
1025
    	return ( "$3-$2-$1 $4:$5:00", "$3-$2-$1 $4:".($5+30).":00" );
1008
    }
1026
    }
1009
    my $datum = substr($input,0,length($input)-11);
1010
    my $van = substr($input,-11);
1011
    $van = substr($van,0,5);
1012
1013
    $datum =~ /(\d\d?)-(\d\d?)-(\d{4})/
1014
      or die "cannot find year in '$datum'";
1015
    my ($dd, $mm, $yyyy) = ($1, $2, $3);
1016
    foreach ($dd, $mm) { $_ = "0$_" if length == 1 }
1017
    $van =~ /(\d\d):(\d\d)/
1018
      or die "cannot find time in '$van'";
1019
    my ($HH, $MM) = ($1, $2);
1020
    return "$yyyy-$mm-$dd $HH:$MM:00";
1021
}
1027
}
1022
1028
1023
1029
Lines 1032-1071 Link Here
1032
    my %channels;
1038
    my %channels;
1033
1039
1034
    if ($sanity) {
1040
    if ($sanity) {
1035
	# Download what we can.
1041
    	# Download what we can.
1036
	my $url = 'http://www.tvgids.nl';
1042
    	my $url = 'http://www.tvgids.nl';
1037
	#All stations are in the select box.
1043
    	#All stations are in the select box.
1038
	#The station ID is the option value
1044
    	#The station ID is the option value
1039
	my $t = new HTML::TreeBuilder();
1045
    	my $t = new HTML::TreeBuilder();
1040
	$t->parse(get_nice($url));
1046
    	$t->parse(get_nice($url));
1041
	my @conts = map { [ $_->content_list() ] }
1047
    	my @conts = map { [ $_->content_list() ] }
1042
	  $t->look_down('_tag' => 'select', 'name' => 'station');
1048
    	  $t->look_down('_tag' => 'select', 'name' => 'station');
1043
	foreach my $cont (@conts) {
1049
    
1044
	    my @children =@$cont;
1050
    	foreach my $cont (@conts) {
1045
	    if (scalar(@children) == 0) {
1051
    	    my @children =@$cont;
1046
		warn 'No stations are defined';
1052
    
1047
		next;
1053
    	    if (scalar(@children) == 0) {
1048
	    }
1054
        		warn 'No stations are defined';
1049
	    foreach my $station_line (@children) {
1055
    	    	next;
1050
		if ($station_line ne ' ') {
1056
            }
1051
		    #This if statement is to prevent parsing the last
1057
        
1052
		    #empty element from the list.
1058
    	    foreach my $station_line (@children) { 
1053
			
1059
        		if ($station_line ne ' ') {
1054
		    my $channel_id = $station_line->attr('value');
1060
    	    	    # This if statement is to prevent parsing the last
1055
		
1061
    		        # empty element from the list.
1056
		    #I am only interested in the normal channels.
1062
    			
1057
		    #tvgids.nl has some pages for the regional stations also
1063
        		    my $channel_id = $station_line->attr('value');
1058
		    #All normal channels have id <0,100>
1064
    		
1059
		    #That is at the moment... Could change in future...
1065
    	    	    # I am only interested in the normal channels.
1060
		    if ($channel_id > 0 && $channel_id < 100) {
1066
    		        # tvgids.nl has some pages for the regional stations also
1061
			my $channel_name = $station_line->as_text();
1067
        		    # All normal channels have id <0,300>
1062
			$channels{$channel_id} = $channel_name
1068
    	    	    # That is at the moment... Could change in future...
1063
		    }
1069
    		        if ($channel_id && $channel_id > 0 && $channel_id < 300) {
1064
		}
1070
                        print $channel_id."\n";
1065
1071
        			    my $channel_name = $station_line->as_text();
1066
	    }
1072
            			$channels{$channel_id} = $channel_name
1067
	}	
1073
    	            }
1068
	$t->delete(); undef $t;
1074
        		}
1075
    	    }
1076
    	}	
1077
    
1078
    	$t->delete(); undef $t;
1069
    }
1079
    }
1070
1080
1071
    # Now read the config file and check against any data we already
1081
    # Now read the config file and check against any data we already
Lines 1138-1288 Link Here
1138
# or else drop it with a warning.
1148
# or else drop it with a warning.
1139
#
1149
#
1140
my %warned_bad_channel_name;
1150
my %warned_bad_channel_name;
1151
1141
sub process_summary_page( $$$$ ) {
1152
sub process_summary_page( $$$$ ) {
1142
    my ($url, $day, $official_day, $ch_name_ref) = @_;
1153
    my ($url, $day, $official_day, $ch_name_ref) = @_;
1143
    die if not defined $url; die if ref $url;
1154
    die if not defined $url; die if ref $url;
1144
    die if not defined $day; die if ref $day;
1155
    die if not defined $day; die if ref $day;
1145
    die if not defined $official_day; die if ref $official_day;
1156
    die if not defined $official_day; die if ref $official_day;
1157
1146
    local $SIG{__WARN__} = sub {
1158
    local $SIG{__WARN__} = sub {
1147
	warn "$url: $_[0]";
1159
    	warn "$url: $_[0]";
1148
    };
1160
    };
1161
    
1149
    local $SIG{__DIE__} = sub {
1162
    local $SIG{__DIE__} = sub {
1150
	die "$url: $_[0]";
1163
    	die "$url: $_[0]";
1151
    };
1164
    };
1165
    
1152
    my $t = new HTML::TreeBuilder;
1166
    my $t = new HTML::TreeBuilder;
1153
    $t->parse(get_nice($url));
1167
    $t->parse(get_nice($url));
1154
    my %interesting; ++ $interesting{$_} foreach
1168
    my @elems = $t->look_down('_tag' => 'table', 'class' => 'overzicht');
1155
      qw(lijst_zender lijst_tijd details_programma);
1169
1156
    my @elems = $t->look_down(sub { die if not defined $_[0];
1157
				    my $c = $_[0]->attr('class');
1158
				    return 0 if not defined $c;
1159
				    return $interesting{$c} });
1160
    if (not @elems) {
1170
    if (not @elems) {
1161
	warn 'did not find any programmes in page';
1171
    	warn 'did not find any programmes in page';
1162
	return ();
1172
    	return ();
1163
    }
1173
    }
1174
1164
    my @bits;
1175
    my @bits;
1176
1165
    foreach my $e (@elems) {
1177
    foreach my $e (@elems) {
1166
	my $class = $e->attr('class');
1178
    	my @cont = $e->look_down('_tag' => 'tr');
1167
	t "looking at elem of class $class";
1179
        my @elems = $t->look_down('_tag' => 'table', 'class' => 'overzicht');
1168
	my $href;
1180
1169
      check_content:
1181
        if (not @elems) {
1170
	my @cont = $e->content_list();
1182
        	warn 'did not find any programmes in page';
1171
	my $got_href = $e->attr('href');
1183
        	return ();
1172
	if (defined $got_href) {
1184
        }
1173
	    t "got href: $got_href";
1174
	    warn "seen 'href's contained inside each other"
1175
	      if defined $href;
1176
	    $href = $got_href;
1177
	}
1178
	if (not @cont) {
1179
	    warn "found $class elem without content, ignoring"
1180
	      unless $class eq 'lijst_zender';
1181
	    next;
1182
	}
1183
	elsif (@cont == 1) {
1184
	    for ($cont[0]) {
1185
		if (ref) {
1186
		    # Unpack this extra layer of element.
1187
		    $e = $_;
1188
		    goto check_content;
1189
		}
1190
		s/^\s+//; s/\s+$//;
1191
		push @bits, [ $class, $href, $_ ];
1192
	    }
1193
	}
1194
	elsif (@cont > 1) {
1195
	    warn "found $class elem with more than one elem inside, ignoring"
1196
	      unless $class eq 'lijst_zender';
1197
	    next;
1198
	}
1199
    }
1185
    }
1200
    $t->delete(); undef $t;
1186
1187
    foreach my $e (@elems) {
1188
       	my @h4elems = $e->look_down('_tag' => 'h4');
1189
        if (not @h4elems) {
1190
            warn 'did not find any date in page';
1191
            return ();
1192
        }
1193
        else {
1194
            push @bits, [ $h4elems[0]->as_text(), $_ ];
1195
        }
1196
1197
    	my @cont = $e->look_down('_tag' => 'tr');
1198
        foreach my $tr (@cont) {
1199
            # Fetch the time
1200
            my @ths = $tr->look_down('_tag' => 'th');
1201
            my @tds = $tr->look_down('_tag' => 'td');
1202
            
1203
            if( @ths == 1 and @tds > 1 ) {
1204
                my $th = $ths[0]; my $td1 = $tds[0]; my $td2 = $tds[1];
1205
                my @ahs = $td1->look_down('_tag' => 'a');
1206
                if( @ahs == 1 ) {
1207
                    push @bits, [ $th->as_text(), $td1->as_text(), $td2->as_text(), $ahs[0]->attr('href'), $_ ];
1208
                }
1209
            }
1210
        }
1211
1212
    }
1213
1214
    $t->delete();
1215
    undef $t;
1216
    
1201
    if (not @bits) {
1217
    if (not @bits) {
1202
	warn "did not see any content, skipping page";
1218
    	warn "did not see any content, skipping page";
1203
	return ();
1219
    	return ();
1204
    }
1220
    }
1221
    
1205
    t 'got bits: ' . d @bits;
1222
    t 'got bits: ' . d @bits;
1206
    if ($bits[0]->[0] eq 'lijst_zender') {
1223
    
1207
	my $date_str = $bits[0]->[2];
1224
   	my $date_str = $bits[0]->[0];
1208
	my $d = parse_dutch_date($date_str);
1225
   	my $d = parse_dutch_date($date_str);
1209
	if (defined $d) {
1226
   	if (defined $d) {
1210
	    my ($d_base, $d_tz) = @{date_to_local($d, $TZ)};
1227
   	    my ($d_base, $d_tz) = @{date_to_local($d, $TZ)};
1211
	    die "date in page $d_base ($date_str) doesn't match expected $official_day"
1228
   	    die "date in page $d_base ($date_str) doesn't match expected $official_day"
1212
	      if UnixDate($d_base, '%Q') ne UnixDate($official_day, '%Q');
1229
   	      if UnixDate($d_base, '%Q') ne UnixDate($official_day, '%Q');
1213
	    shift @bits;
1230
   	    shift @bits;
1214
	}
1231
   	}
1215
	# otherwise, leave it for later processing
1232
1216
    }
1217
    my @todo;
1233
    my @todo;
1218
    while (@bits >= 3) {
1234
1219
	my $ch_bit = shift @bits;
1235
    while (@bits > 0) {
1220
	my $ch_class = $ch_bit->[0];
1236
    	my $bit = shift @bits;
1221
	if ($ch_class ne 'lijst_zender') {
1237
1222
	    warn "bit expected to be channel name has class $ch_class not lijst_zender, skipping";
1238
    	my $ch = $bit->[2];
1223
	    next;
1239
    	t 'shifted bit to get $ch=' . d $ch;
1224
	}
1240
    	for ($$ch_name_ref) {
1225
	my $ch = $ch_bit->[2];
1241
    	    if (not defined) { $_ = $ch }
1226
	t 'shifted bit to get $ch=' . d $ch;
1242
       	    elsif ($_ ne $ch) {
1227
	for ($$ch_name_ref) {
1243
        		# Most likely the channels file has the wrong name.
1228
	    if (not defined) { $_ = $ch }
1244
        		warn "expected channel name $_, got $ch\n"
1229
	    elsif ($_ ne $ch) {
1245
        		  unless $warned_bad_channel_name{$_}{$ch}++;
1230
		# Most likely the channels file has the wrong name.
1246
        		$_ = $ch;
1231
		warn "expected channel name $_, got $ch\n"
1247
    	    }
1232
		  unless $warned_bad_channel_name{$_}{$ch}++;
1248
    	}
1233
		$_ = $ch;
1234
	    }
1235
	}
1236
	
1249
	
1237
	my $times_bit = shift @bits;
1250
    	my $times = $bit->[0];
1238
	my $times_class = $times_bit->[0];
1251
    	t 'shifted bit to get $times=' . d $times;
1239
	if ($times_class ne 'lijst_tijd') {
1252
    	
1240
	    warn "bit expected to be times has class $times_class not lijst_tijd, skipping";
1253
    	my $title_href = $bit->[3];
1241
	    next;
1254
    	my $title = $bit->[1];
1242
	}
1255
    	t 'shifted bit to get $title_href=' . d $title_href . ', '
1243
	my $times = $times_bit->[2];
1256
    	  . '$title=' . d $title;
1244
	t 'shifted bit to get $times=' . d $times;
1257
          
1258
    	if ($title =~ /^Ieder heel uur .+, tenzij anders vermeld$/) {
1259
    	    # A certain programme on the hour.  But it isn't worth
1260
    	    # adding this to the output, ignore it.
1261
    	    #
1262
    	    next;
1263
    	}
1264
1265
    	if ($title eq 'NB: Programmering onder voorbehoud') {
1266
    	    # Programming subject to change.  There isn't a way to
1267
    	    # represent this in the current XMLTV format.
1268
    	    #
1269
    	    next;
1270
    	}
1245
	
1271
	
1246
	my $title_bit = shift @bits;
1272
       	my ($start_hhmm, $stop_hhmm);
1247
	my $title_class = $title_bit->[0];
1273
    	if ($times =~ /^(\d\d):(\d\d) - /
1248
	if ($title_class ne 'details_programma') {
1274
    	    and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) {
1249
	    warn "bit expected to be title has class $title_class not details_programma, skipping";
1275
    	    $start_hhmm = "$1:$2";
1250
	    next;
1276
    	}
1251
	}
1277
    	if ($times =~ / - (\d\d):(\d\d)$/
1252
	my $title_href = $title_bit->[1];
1278
    	    and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) {
1253
	my $title = $title_bit->[2];
1279
    	    $stop_hhmm = "$1:$2";
1254
	t 'shifted bit to get $title_href=' . d $title_href . ', '
1280
    	}
1255
	  . '$title=' . d $title;
1281
1256
1282
    	# Right, got channel name, times, and title.
1257
	if ($title =~ /^Ieder heel uur .+, tenzij anders vermeld$/) {
1283
    	# FIXME should check channel name (among other things).
1258
	    # A certain programme on the hour.  But it isn't worth
1284
    	#
1259
	    # adding this to the output, ignore it.
1285
       	my $title_url = URI->new_abs($title_href, $url);
1260
	    #
1286
        
1261
	    next;
1287
       	# We prefer to handle URLs as strings.
1262
	}
1288
       	push @todo, [ $start_hhmm, $stop_hhmm, $title, "$title_url" ];
1263
	if ($title eq 'NB: Programmering onder voorbehoud') {
1264
	    # Programming subject to change.  There isn't a way to
1265
	    # represent this in the current XMLTV format.
1266
	    #
1267
	    next;
1268
	}
1269
	
1270
	my ($start_hhmm, $stop_hhmm);
1271
	if ($times =~ /^(\d\d):(\d\d)-/
1272
	    and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) {
1273
	    $start_hhmm = "$1:$2";
1274
	}
1275
	if ($times =~ /-(\d\d):(\d\d)$/
1276
	    and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) {
1277
	    $stop_hhmm = "$1:$2";
1278
	}
1279
1280
	# Right, got channel name, times, and title.
1281
	# FIXME should check channel name (among other things).
1282
	#
1283
	my $title_url = URI->new_abs($title_href, $url);
1284
	# We prefer to handle URLs as strings.
1285
	push @todo, [ $start_hhmm, $stop_hhmm, $title, "$title_url" ];
1286
    }
1289
    }
1287
1290
1288
    # Now we need to make some sense of the times.  When stop time
1291
    # Now we need to make some sense of the times.  When stop time
Lines 1296-1386 Link Here
1296
    my $next_day = UnixDate(DateCalc($day, '+ 1 day'), '%Y-%m-%d');
1299
    my $next_day = UnixDate(DateCalc($day, '+ 1 day'), '%Y-%m-%d');
1297
    my $crossing_at;
1300
    my $crossing_at;
1298
#    local $Log::TraceMessages::On = 1;
1301
#    local $Log::TraceMessages::On = 1;
1302
1299
    t 'looking at raw times and searching for midnight crossing: ' . d \@todo;
1303
    t 'looking at raw times and searching for midnight crossing: ' . d \@todo;
1304
1300
    for (my $i = 0; $i < @todo; ++$i) {
1305
    for (my $i = 0; $i < @todo; ++$i) {
1301
	my ($start_hhmm, $stop_hhmm) = @{$todo[$i]};
1306
    	my ($start_hhmm, $stop_hhmm) = @{$todo[$i]};
1302
	t '$start_hhmm=' . d $start_hhmm;
1307
    	t '$start_hhmm=' . d $start_hhmm;
1303
	t '$stop_hhmm=' . d $stop_hhmm;
1308
    	t '$stop_hhmm=' . d $stop_hhmm;
1304
	next if not defined $start_hhmm;
1309
    	next if not defined $start_hhmm;
1305
	next if not defined $stop_hhmm;
1310
    	next if not defined $stop_hhmm;
1306
1311
1307
	my $start = parse_local_date("$day $start_hhmm", $TZ);
1312
    	my $start = parse_local_date("$day $start_hhmm", $TZ);
1308
	my $stop = parse_local_date("$day $stop_hhmm", $TZ);
1313
    	my $stop = parse_local_date("$day $stop_hhmm", $TZ);
1309
	t "checking if $start -> $stop goes backwards";
1314
    	t "checking if $start -> $stop goes backwards";
1310
	next if Date_Cmp($start, $stop) <= 0;
1315
    	next if Date_Cmp($start, $stop) <= 0;
1311
	t "yup, it's a candidate";
1316
    	t "yup, it's a candidate";
1312
1317
1313
	my $stop_next_day = parse_local_date("$next_day $stop_hhmm", $TZ);
1318
    	my $stop_next_day = parse_local_date("$next_day $stop_hhmm", $TZ);
1314
	die if Date_Cmp($stop_next_day, $start) <= 0;
1319
    	die if Date_Cmp($stop_next_day, $start) <= 0;
1315
	t 'if it were, stop time on next day would be: ' . d $stop_next_day;
1320
    	t 'if it were, stop time on next day would be: ' . d $stop_next_day;
1316
1321
1317
	my $distance = Delta_Format(DateCalc($start, $stop_next_day), 0, '%st');
1322
    	my $distance = Delta_Format(DateCalc($start, $stop_next_day), 0, '%st');
1318
	t '...and length of programme: ' . d $distance;
1323
    	t '...and length of programme: ' . d $distance;
1319
	t 'shortest length so far: ' . d $shortest;
1324
    	t 'shortest length so far: ' . d $shortest;
1320
	if (not defined $shortest or $distance < $shortest) {
1325
    
1321
	    t 'this is the best so far';
1326
    	if (not defined $shortest or $distance < $shortest) {
1322
	    $shortest = $distance;
1327
    	    t 'this is the best so far';
1323
	    $crossing_at = $i;
1328
    	    $shortest = $distance;
1324
	}
1329
    	    $crossing_at = $i;
1330
    	}
1325
    }
1331
    }
1332
1326
    t '@todo=' . d \@todo;
1333
    t '@todo=' . d \@todo;
1327
1334
1328
    # Now given the place at which we cross from $day to $next_day we
1335
    # Now given the place at which we cross from $day to $next_day we
1329
    # can add the appropriate days to the hh:mm times.
1336
    # can add the appropriate days to the hh:mm times.
1330
    #
1337
    #
1331
    if (not defined $crossing_at) {
1338
    if (not defined $crossing_at) {
1332
	push @$_, $day, $day foreach @todo;
1339
    	push @$_, $day, $day foreach @todo;
1333
    }
1340
    }
1334
    else {
1341
    else {
1335
	for (my $i = 0; $i < $crossing_at; ++$i) {
1342
    	for (my $i = 0; $i < $crossing_at; ++$i) {
1336
	    push @{$todo[$i]}, $day, $day;
1343
    	    push @{$todo[$i]}, $day, $day;
1337
	}
1344
    	}
1338
	for (my $i = $crossing_at) {
1345
    	for (my $i = $crossing_at) {
1339
	    push @{$todo[$i]}, $day, $next_day;
1346
    	    push @{$todo[$i]}, $day, $next_day;
1340
	}
1347
    	}
1341
	for (my $i = $crossing_at + 1; $i < @todo; ++$i) {
1348
    	for (my $i = $crossing_at + 1; $i < @todo; ++$i) {
1342
	    push @{$todo[$i]}, $next_day, $next_day;
1349
    	    push @{$todo[$i]}, $next_day, $next_day;
1343
	}
1350
    	}
1344
    }
1351
    }
1345
1352
1346
    # Now we can parse the dates into Date::Manip objects.
1353
    # Now we can parse the dates into Date::Manip objects.
1347
    my @r;
1354
    my @r;
1348
    foreach (@todo) {
1355
    foreach (@todo) {
1349
	my ($start_hhmm, $stop_hhmm, $title, $title_url, $start_day, $stop_day) = @$_;
1356
    	my ($start_hhmm, $stop_hhmm, $title, $title_url, $start_day, $stop_day) = @$_;
1350
	my ($start, $stop);
1357
    	my ($start, $stop);
1351
	if (defined $start_hhmm) {
1358
    	if (defined $start_hhmm) {
1352
	    $start = parse_local_date("$start_day $start_hhmm", $TZ);
1359
    	    $start = parse_local_date("$start_day $start_hhmm", $TZ);
1353
	}
1360
	    }
1354
	if (defined $stop_hhmm) {
1361
    	if (defined $stop_hhmm) {
1355
	    $stop = parse_local_date("$stop_day $stop_hhmm", $TZ);
1362
    	    $stop = parse_local_date("$stop_day $stop_hhmm", $TZ);
1356
	}
1363
    	}
1357
	push @r, [ $start, $stop, $title, $title_url ];
1364
    	push @r, [ $start, $stop, $title, $title_url ];
1358
    }
1365
    }
1366
1359
    t 'after parsing dates: ' . d \@r;
1367
    t 'after parsing dates: ' . d \@r;
1360
1368
1361
    # Check the dates and weed out those which are obviously wrong.
1369
    # Check the dates and weed out those which are obviously wrong.
1362
    my $last_start;
1370
    my $last_start;
1363
    foreach (@r) {
1371
    foreach (@r) {
1364
	our ($start, $stop);
1372
    	our ($start, $stop);
1365
	local (*start, *stop) = \ ($_->[0], $_->[1]);
1373
    	local (*start, *stop) = \ ($_->[0], $_->[1]);
1366
	t 'checking dates, $last_start=' . d $last_start;
1374
    	t 'checking dates, $last_start=' . d $last_start;
1367
	t '$start=' . d $start;
1375
    	t '$start=' . d $start;
1368
	t '$stop=' . d $stop;
1376
    	t '$stop=' . d $stop;
1369
	if (defined $start and defined $stop
1377
1370
	    and Date_Cmp($start, $stop) > 0) {
1378
    	if (defined $start and defined $stop
1371
	    # Appears to stop before it starts.  Assume the stop time
1379
	      and Date_Cmp($start, $stop) > 0) {
1372
	    # is bogus but the start time might be okay.
1380
    	    # Appears to stop before it starts.  Assume the stop time
1373
	    #
1381
    	    # is bogus but the start time might be okay.
1374
	    undef $stop;
1382
    	    #
1375
	}
1383
    	    undef $stop;
1376
	if (defined $last_start) {
1384
    	}
1377
	    if (defined $start and Date_Cmp($start, $last_start) < 0) {
1385
    
1378
		# Appears to start before previous start.
1386
    	if (defined $last_start) {
1379
		undef $start;
1387
    	    if (defined $start and Date_Cmp($start, $last_start) < 0) {
1380
	    }
1388
    		# Appears to start before previous start.
1389
    		undef $start;
1390
        }
1381
	    if (defined $stop and Date_Cmp($stop, $last_start) < 0) {
1391
	    if (defined $stop and Date_Cmp($stop, $last_start) < 0) {
1382
		# Stops before previous start - that's just as bad.
1392
    		# Stops before previous start - that's just as bad.
1383
		undef $stop;
1393
    		undef $stop;
1384
	    }
1394
	    }
1385
	}
1395
	}
1386
	$last_start = $start if defined $start;
1396
	$last_start = $start if defined $start;
Lines 1389-1407 Link Here
1389
1399
1390
    # If there is a 'next page' link do a recursive call to handle it.
1400
    # If there is a 'next page' link do a recursive call to handle it.
1391
    foreach (@bits) {
1401
    foreach (@bits) {
1392
	my ($type, $href, $text) = @$_;
1402
        my ($type, $href, $text) = @$_;
1393
	if ($type eq 'lijst_zender' and $text eq 'Volgende') {
1403
       	if ($type eq 'lijst_zender' and $text eq 'Volgende') {
1394
	    # Next page for this day.
1404
       	    # Next page for this day.
1395
	    my $next_url = URI->new_abs($href, $url);
1405
            my $next_url = URI->new_abs($href, $url);
1396
	    # Turn URL back into a string.
1406
      	    # Turn URL back into a string.
1397
	    push @r, process_summary_page("$next_url", $day, $official_day, $ch_name_ref)
1407
       	    push @r, process_summary_page("$next_url", $day, $official_day, $ch_name_ref)
1398
	}
1408
       	}
1399
	elsif ($type eq 'lijst_zender' and $text eq 'Vorige') {
1409
       	elsif ($type eq 'lijst_zender' and $text eq 'Vorige') {
1400
	    # Previous page for this day, assume already fetched.
1410
       	    # Previous page for this day, assume already fetched.
1401
	}
1411
       	}
1402
	else {
1412
       	else {
1403
	    warn "discarding leftover $type: $text";
1413
          	    warn "discarding leftover $type: $text";
1404
	}
1414
       	}
1405
    }
1415
    }
1406
1416
1407
    t 'returning tuples: ' . d \@r;
1417
    t 'returning tuples: ' . d \@r;
Lines 1448-1460 Link Here
1448
	s/\bdecember\b/December/g;
1458
	s/\bdecember\b/December/g;
1449
	s/\bdec\b/December/g;
1459
	s/\bdec\b/December/g;
1450
1460
1451
	s/\bzondag\b/Sunday/g;
1461
	s/\bZondag\b/Sunday/g;
1452
	s/\bmaandag\b/Monday/g;
1462
	s/\bMaandag\b/Monday/g;
1453
	s/\bdinsdag\b/Tuesday/g;
1463
	s/\bDinsdag\b/Tuesday/g;
1454
	s/\bwoensdag\b/Wednesday/g;
1464
	s/\bWoensdag\b/Wednesday/g;
1455
	s/\bdonderdag\b/Thursday/g;
1465
	s/\bDonderdag\b/Thursday/g;
1456
	s/\bvrijdag\b/Friday/g;
1466
	s/\bVrijdag\b/Friday/g;
1457
	s/\bzaterdag\b/Saturday/g;
1467
	s/\bZaterdag\b/Saturday/g;
1458
1468
1459
	my $r;
1469
	my $r;
1460
	eval { $r = parse_local_date($_, $TZ) };
1470
	eval { $r = parse_local_date($_, $TZ) };

Return to bug 94101