Lines 53-60
Link Here
|
53 |
|
53 |
|
54 |
=head1 AUTHOR |
54 |
=head1 AUTHOR |
55 |
|
55 |
|
56 |
Guido Diepen and Ed Avis (ed@membled.com). Originally based on |
56 |
First version by Guido Diepen and Ed Avis (ed@membled.com). |
57 |
tv_grab_fi by Matti Airas. |
57 |
Second version by Eric Bus (eric@fambus.nl). |
|
|
58 |
Originally based on tv_grab_fi by Matti Airas. |
58 |
|
59 |
|
59 |
=head1 BUGS |
60 |
=head1 BUGS |
60 |
|
61 |
|
Lines 115-122
Link Here
|
115 |
} |
116 |
} |
116 |
|
117 |
|
117 |
# Function prototypes. |
118 |
# Function prototypes. |
118 |
sub time_tot_str( $ ); |
119 |
sub time_to_str( $ ); |
119 |
sub time_van_str( $ ); |
|
|
120 |
sub get_channels( $ ); |
120 |
sub get_channels( $ ); |
121 |
sub process_summary_page( $$$$ ); |
121 |
sub process_summary_page( $$$$ ); |
122 |
sub parse_dutch_date( $ ); |
122 |
sub parse_dutch_date( $ ); |
Lines 328-339
Link Here
|
328 |
foreach $ch_did (@channels) { |
328 |
foreach $ch_did (@channels) { |
329 |
my $ch_xid = "$ch_did.tvgids.nl"; |
329 |
my $ch_xid = "$ch_did.tvgids.nl"; |
330 |
for (my $i = $opt_offset;$i<($opt_offset + $opt_days);$i++) { |
330 |
for (my $i = $opt_offset;$i<($opt_offset + $opt_days);$i++) { |
331 |
my $url = 'http://www.tvgids.nl/zoekprogramma.php' |
331 |
my $url = 'http://www.tvgids.nl/zoeken/' |
332 |
. "?station=$ch_did&interval=$i"; |
332 |
. "?station=$ch_did&genre=&interval=$i×lot=0"; |
333 |
my $day = UnixDate(DateCalc($now, "+ $i days"), '%Y-%m-%d'); |
333 |
my $day = UnixDate(DateCalc($now, "+ $i days"), '%Y-%m-%d'); |
334 |
die if not defined $day; |
334 |
die if not defined $day; |
335 |
die if ref $url; |
335 |
die if ref $url; |
336 |
push @to_get, [ $url, $day, $ch_xid, $ch_did ]; |
336 |
push @to_get, [ $url, $day, $ch_xid, $ch_did ]; |
337 |
} |
337 |
} |
338 |
} |
338 |
} |
339 |
|
339 |
|
Lines 342-347
Link Here
|
342 |
my @summary_page_data; |
342 |
my @summary_page_data; |
343 |
my $bar = new XMLTV::ProgressBar('downloading summary', scalar @to_get) |
343 |
my $bar = new XMLTV::ProgressBar('downloading summary', scalar @to_get) |
344 |
if not $opt_quiet; |
344 |
if not $opt_quiet; |
|
|
345 |
|
345 |
foreach (@to_get) { |
346 |
foreach (@to_get) { |
346 |
my ($url, $day, $ch_xmltv_id, $ch_tvgids_id) = @$_; |
347 |
my ($url, $day, $ch_xmltv_id, $ch_tvgids_id) = @$_; |
347 |
die if ref $url; |
348 |
die if ref $url; |
Lines 354-359
Link Here
|
354 |
|
355 |
|
355 |
# Now we've fetched the descriptions, we know the channel names. |
356 |
# Now we've fetched the descriptions, we know the channel names. |
356 |
foreach $ch_did (@channels) { |
357 |
foreach $ch_did (@channels) { |
|
|
358 |
print "processing channel #$ch_did\n"; |
357 |
$ch_name = $channels{$ch_did}; |
359 |
$ch_name = $channels{$ch_did}; |
358 |
die "did not see any name for $ch_did in any listing pages" |
360 |
die "did not see any name for $ch_did in any listing pages" |
359 |
if not defined $ch_name; |
361 |
if not defined $ch_name; |
Lines 364-369
Link Here
|
364 |
|
366 |
|
365 |
my @summary_programmes; |
367 |
my @summary_programmes; |
366 |
my %detail_url_to_summary_url; |
368 |
my %detail_url_to_summary_url; |
|
|
369 |
|
367 |
foreach (@summary_page_data) { |
370 |
foreach (@summary_page_data) { |
368 |
my ($summary_url, $ch_xmltv_id, $data) = @$_; |
371 |
my ($summary_url, $ch_xmltv_id, $data) = @$_; |
369 |
|
372 |
|
Lines 373-514
Link Here
|
373 |
# |
376 |
# |
374 |
my $clump; # [ start, stop, programmes ]. stop may be undef. |
377 |
my $clump; # [ start, stop, programmes ]. stop may be undef. |
375 |
my @clumps; |
378 |
my @clumps; |
|
|
379 |
|
376 |
foreach (@$data) { |
380 |
foreach (@$data) { |
377 |
my ($start, $stop, $title, $url) = @$_; |
381 |
my ($start, $stop, $title, $url) = @$_; |
378 |
die if ref $url; |
382 |
die if ref $url; |
379 |
local $SIG{__WARN__} = sub { warn "$url: $_[0]" }; |
383 |
local $SIG{__WARN__} = sub { warn "$url: $_[0]" }; |
380 |
for ($detail_url_to_summary_url{$url}) { |
384 |
for ($detail_url_to_summary_url{$url}) { |
381 |
warn "more than one programme with same details page $url" |
385 |
warn "more than one programme with same details page $url" |
382 |
if defined; |
386 |
if defined; |
383 |
$_ = $summary_url; |
387 |
$_ = $summary_url; |
384 |
} |
388 |
} |
385 |
|
389 |
|
386 |
# Start and stop are common to a whole clump, but these |
390 |
# Start and stop are common to a whole clump, but these |
387 |
# two are given individually for each programme. |
391 |
# two are given individually for each programme. |
388 |
# |
392 |
# |
389 |
my $details = [ $title, $url ]; |
393 |
my $details = [ $title, $url ]; |
390 |
|
394 |
|
391 |
if (not defined $start) { |
395 |
if (not defined $start) { |
392 |
if (not $clump) { |
396 |
if (not $clump) { |
393 |
warn "programme '$title' at beginning of page has no start time, dropping\n"; |
397 |
warn "programme '$title' at beginning of page has no start time, dropping\n"; |
394 |
next; |
398 |
next; |
395 |
} |
399 |
} |
396 |
t 'found programme with no start time'; |
400 |
t 'found programme with no start time'; |
397 |
|
401 |
|
398 |
if (defined $clump->[1]) { |
402 |
if (defined $clump->[1]) { |
399 |
t 'make it start at stop of last clump'; |
403 |
t 'make it start at stop of last clump'; |
400 |
die if not defined $clump->[0]; |
404 |
die if not defined $clump->[0]; |
401 |
push @clumps, $clump; |
405 |
push @clumps, $clump; |
402 |
$clump = [ $clump->[1], $stop, [ $details ] ]; |
406 |
$clump = [ $clump->[1], $stop, [ $details ] ]; |
403 |
die if not defined $clump->[0]; |
407 |
die if not defined $clump->[0]; |
404 |
} |
408 |
} |
405 |
else { |
409 |
else { |
406 |
t 'current clump has no stop, add to clump'; |
410 |
t 'current clump has no stop, add to clump'; |
407 |
push @{$clump->[2]}, $details; |
411 |
push @{$clump->[2]}, $details; |
408 |
t 'maybe set stop of current clump'; |
412 |
t 'maybe set stop of current clump'; |
409 |
$clump->[1] = $stop; |
413 |
$clump->[1] = $stop; |
410 |
} |
414 |
} |
411 |
} |
415 |
} |
412 |
else { |
416 |
else { |
413 |
t 'programme has start time, make new clump'; |
417 |
t 'programme has start time, make new clump'; |
414 |
if ($clump) { |
418 |
if ($clump) { |
415 |
die if not defined $clump->[0]; |
419 |
die if not defined $clump->[0]; |
416 |
my $cmp = Date_Cmp($clump->[0], $start); |
420 |
my $cmp = Date_Cmp($clump->[0], $start); |
417 |
if ($cmp == 0) { |
421 |
|
418 |
# Oddity in the web pages: this programme has the |
422 |
if ($cmp == 0) { |
419 |
# same start time as a previous one. (Until I |
423 |
# Oddity in the web pages: this programme has the |
420 |
# found this case, the clumping was only for cases |
424 |
# same start time as a previous one. (Until I |
421 |
# where a programme lacked a start time, I think.) |
425 |
# found this case, the clumping was only for cases |
422 |
# Anyway handle this by making it join the |
426 |
# where a programme lacked a start time, I think.) |
423 |
# existing clump. |
427 |
# Anyway handle this by making it join the |
424 |
# |
428 |
# existing clump. |
425 |
t 'same start time as existing clump, join it'; |
429 |
# |
426 |
|
430 |
t 'same start time as existing clump, join it'; |
427 |
if (defined $stop) { |
431 |
|
428 |
# Compare stop time of the current clump with |
432 |
if (defined $stop) { |
429 |
# this programme and extend the clump if |
433 |
# Compare stop time of the current clump with |
430 |
# necessary. |
434 |
# this programme and extend the clump if |
431 |
# |
435 |
# necessary. |
432 |
my $cmp = Date_Cmp($clump->[1], $stop); |
436 |
# |
433 |
if ($cmp < 0) { |
437 |
my $cmp = Date_Cmp($clump->[1], $stop); |
434 |
# Okay, later stop time, extend clump. |
438 |
if ($cmp < 0) { |
435 |
$clump->[1] = $stop; |
439 |
# Okay, later stop time, extend clump. |
436 |
} |
440 |
$clump->[1] = $stop; |
437 |
elsif ($cmp == 0) { |
441 |
} |
438 |
# Okay. |
442 |
elsif ($cmp == 0) { |
439 |
} |
443 |
# Okay. |
440 |
elsif ($cmp > 0) { |
444 |
} |
441 |
warn "programme (from $start to $stop) has same start as one before it, but earlier stop, ignoring stop time\n"; |
445 |
elsif ($cmp > 0) { |
442 |
} |
446 |
warn "programme (from $start to $stop) has same start as one before it, but earlier stop, ignoring stop time\n"; |
443 |
else { die } |
447 |
} |
444 |
} |
448 |
else { die } |
445 |
|
449 |
} |
446 |
push @{$clump->[2]}, $details; |
450 |
|
447 |
} |
451 |
push @{$clump->[2]}, $details; |
448 |
else { |
452 |
} |
449 |
# (Don't bother checking that $cmp < 0, it should |
453 |
else { |
450 |
# be, but often programmes appear out of order...) |
454 |
# (Don't bother checking that $cmp < 0, it should |
451 |
# |
455 |
# be, but often programmes appear out of order...) |
452 |
push @clumps, $clump; |
456 |
# |
453 |
$clump = [ $start, $stop, [ $details ] ]; |
457 |
push @clumps, $clump; |
454 |
t 'started a new clump for new start time, now: ' . d $clump; |
458 |
$clump = [ $start, $stop, [ $details ] ]; |
455 |
} |
459 |
t 'started a new clump for new start time, now: ' . d $clump; |
456 |
} |
460 |
} |
457 |
else { |
461 |
} |
458 |
t 'no existing clump, starting one for this programme'; |
462 |
else { |
459 |
$clump = [ $start, $stop, [ $details ] ]; |
463 |
t 'no existing clump, starting one for this programme'; |
460 |
} |
464 |
$clump = [ $start, $stop, [ $details ] ]; |
461 |
die if not defined $clump->[0]; |
465 |
} |
462 |
} |
466 |
|
|
|
467 |
die if not defined $clump->[0]; |
468 |
} |
463 |
} |
469 |
} |
|
|
470 |
|
464 |
if ($clump) { |
471 |
if ($clump) { |
465 |
die if not defined $clump->[0]; |
472 |
die if not defined $clump->[0]; |
466 |
push @clumps, $clump; |
473 |
push @clumps, $clump; |
467 |
} |
474 |
} |
|
|
475 |
|
468 |
t '\@clumps=' . d \@clumps; |
476 |
t '\@clumps=' . d \@clumps; |
469 |
|
477 |
|
470 |
# Now add the clumpidx attributes. |
478 |
# Now add the clumpidx attributes. |
471 |
foreach (@clumps) { |
479 |
foreach (@clumps) { |
472 |
my ($start, $stop, $l) = @$_; |
480 |
my ($start, $stop, $l) = @$_; |
473 |
die if not defined $start; |
481 |
die if not defined $start; |
474 |
my $num_in_clump = @$l; |
482 |
my $num_in_clump = @$l; |
475 |
if ($num_in_clump == 1) { |
483 |
if ($num_in_clump == 1) { |
476 |
# Common case, no clumpidx needed. |
484 |
# Common case, no clumpidx needed. |
477 |
} |
485 |
} |
478 |
elsif ($num_in_clump > 1) { |
486 |
elsif ($num_in_clump > 1) { |
479 |
foreach my $i (0 .. $num_in_clump - 1) { |
487 |
foreach my $i (0 .. $num_in_clump - 1) { |
480 |
# Add clumpidx as last thing in list. |
488 |
# Add clumpidx as last thing in list. |
481 |
push @{$l->[$i]}, "$i/$num_in_clump"; |
489 |
push @{$l->[$i]}, "$i/$num_in_clump"; |
482 |
} |
490 |
} |
483 |
} |
491 |
} |
484 |
else { die } |
492 |
else { die } |
485 |
} |
493 |
} |
|
|
494 |
|
486 |
t 'after adding clumpidxes, \@clumps=' . d \@clumps; |
495 |
t 'after adding clumpidxes, \@clumps=' . d \@clumps; |
487 |
|
496 |
|
488 |
# Finally turn the data into programmes. |
497 |
# Finally turn the data into programmes. |
489 |
foreach (@clumps) { |
498 |
foreach (@clumps) { |
490 |
my ($start, $stop, $l) = @$_; |
499 |
my ($start, $stop, $l) = @$_; |
491 |
die if not defined $start; |
500 |
die if not defined $start; |
492 |
foreach (@$l) { |
501 |
foreach (@$l) { |
493 |
my ($title, $url, $clumpidx) = @$_; |
502 |
my ($title, $url, $clumpidx) = @$_; |
494 |
my %h = (channel => $ch_xmltv_id, |
503 |
my %h = (channel => $ch_xmltv_id, |
495 |
title => [ [ $title, $LANG ] ], |
504 |
title => [ [ $title, $LANG ] ], |
496 |
); |
505 |
); |
497 |
for (date_to_local($start, $TZ)) { |
506 |
|
498 |
$h{start} = UnixDate($_->[0], '%q') . " $_->[1]"; |
507 |
for (date_to_local($start, $TZ)) { |
|
|
508 |
$h{start} = UnixDate($_->[0], '%q') . " $_->[1]"; |
499 |
} |
509 |
} |
|
|
510 |
|
500 |
if (defined $stop) { |
511 |
if (defined $stop) { |
501 |
for (date_to_local($stop, $TZ)) { |
512 |
for (date_to_local($stop, $TZ)) { |
502 |
$h{stop} = UnixDate($_->[0], '%q') . " $_->[1]"; |
513 |
$h{stop} = UnixDate($_->[0], '%q') . " $_->[1]"; |
503 |
} |
514 |
} |
504 |
} |
515 |
} |
505 |
|
516 |
|
506 |
if (defined $url) { |
517 |
if (defined $url) { |
507 |
die if ref $url; |
518 |
die if ref $url; |
508 |
$h{url} = [ $url ]; |
519 |
$h{url} = [ $url ]; |
509 |
} |
520 |
} |
510 |
$h{clumpidx} = $clumpidx if defined $clumpidx; |
521 |
|
511 |
push @summary_programmes, \%h; |
522 |
$h{clumpidx} = $clumpidx if defined $clumpidx; |
|
|
523 |
push @summary_programmes, \%h; |
512 |
} |
524 |
} |
513 |
} |
525 |
} |
514 |
} |
526 |
} |
Lines 573-605
Link Here
|
573 |
my $new = $detailed->{$_}; |
585 |
my $new = $detailed->{$_}; |
574 |
|
586 |
|
575 |
if ($_ eq 'title') { |
587 |
if ($_ eq 'title') { |
576 |
# We know how to merge this. TODO write general |
588 |
# We know how to merge this. TODO write general |
577 |
# XMLTV::Merge. |
589 |
# XMLTV::Merge. |
578 |
# |
590 |
# |
579 |
my %already; |
591 |
my %already; |
580 |
foreach my $a (@$new) { |
592 |
foreach my $a (@$new) { |
581 |
my $d = Dumper($a); |
593 |
my $d = Dumper($a); |
582 |
$already{$d}++ && warn "duplicate $_: $d"; |
594 |
$already{$d}++ && warn "duplicate $_: $d"; |
583 |
} |
595 |
} |
584 |
foreach my $o (@$old) { |
596 |
foreach my $o (@$old) { |
585 |
my $d = Dumper($o); |
597 |
my $d = Dumper($o); |
586 |
push @$new, $o unless $already{$d}; |
598 |
push @$new, $o unless $already{$d}; |
587 |
} |
599 |
} |
588 |
} |
600 |
} |
589 |
else { |
601 |
else { |
590 |
# Compare the two data structures. For this to work |
602 |
# Compare the two data structures. For this to work |
591 |
# correctly it requires Data::Dumper 2.12 or later, as |
603 |
# correctly it requires Data::Dumper 2.12 or later, as |
592 |
# shipped with perl 5.8.0. Older versions don't |
604 |
# shipped with perl 5.8.0. Older versions don't |
593 |
# support $Sortkeys. But we don't have any version |
605 |
# support $Sortkeys. But we don't have any version |
594 |
# check here - in the worst case all that results from |
606 |
# check here - in the worst case all that results from |
595 |
# using an older Data::Dumper is a few spurious |
607 |
# using an older Data::Dumper is a few spurious |
596 |
# warning messages. |
608 |
# warning messages. |
597 |
# |
609 |
# |
598 |
my $old_dump = Dumper($old); |
610 |
my $old_dump = Dumper($old); |
599 |
my $new_dump = Dumper($new); |
611 |
my $new_dump = Dumper($new); |
600 |
if ($old_dump ne $new_dump) { |
612 |
if ($old_dump ne $new_dump) { |
601 |
warn "mismatch between summary page and details page $url for $_: $old_dump vs $new_dump\n"; |
613 |
warn "mismatch between summary page and details page $url for $_: $old_dump vs $new_dump\n"; |
602 |
} |
614 |
} |
603 |
} |
615 |
} |
604 |
} |
616 |
} |
605 |
|
617 |
|
Lines 629-648
Link Here
|
629 |
my $warned_bad_chars; |
641 |
my $warned_bad_chars; |
630 |
sub tidy( $ ) { |
642 |
sub tidy( $ ) { |
631 |
for (my $tmp = shift) { |
643 |
for (my $tmp = shift) { |
632 |
tr/\221\222/''/; |
644 |
tr/\221\222/''/; |
633 |
if (tr/\012\015\040-\176\240-\377//dc) { |
645 |
if (tr/\012\015\040-\176\240-\377//dc) { |
634 |
warn 'removing bad characters' unless $warned_bad_chars++; |
646 |
warn 'removing bad characters' unless $warned_bad_chars++; |
635 |
} |
647 |
} |
636 |
return $_; |
648 |
return $_; |
637 |
} |
649 |
} |
638 |
} |
650 |
} |
639 |
|
651 |
|
640 |
# Returns a programme hashref, or undef, or the magic 'END'. |
652 |
# Returns a programme hashref, or undef, or the magic 'END'. |
641 |
sub process_details_page( $$$ ) { |
653 |
sub process_details_page( $$$ ) { |
642 |
foreach (@_) { die if ref } |
654 |
foreach (@_) { die if ref } |
|
|
655 |
|
643 |
my ($ch_xmltv_id, $url, $master_url) = @_; |
656 |
my ($ch_xmltv_id, $url, $master_url) = @_; |
|
|
657 |
|
644 |
local $SIG{__WARN__} = sub { |
658 |
local $SIG{__WARN__} = sub { |
645 |
warn "$url (from $master_url): $_[0]"; |
659 |
warn "$url (from $master_url): $_[0]"; |
646 |
}; |
660 |
}; |
647 |
|
661 |
|
648 |
# We make an HTML::TreeBuilder object, get the information |
662 |
# We make an HTML::TreeBuilder object, get the information |
Lines 650-947
Link Here
|
650 |
# |
664 |
# |
651 |
my $t = new HTML::TreeBuilder(); |
665 |
my $t = new HTML::TreeBuilder(); |
652 |
eval { |
666 |
eval { |
653 |
$t->parse(tidy(get_nice($url))); |
667 |
$t->parse(tidy(get_nice($url))); |
654 |
}; |
668 |
}; |
|
|
669 |
|
655 |
if ($@) { |
670 |
if ($@) { |
656 |
warn "error getting/parsing $url: $@"; |
671 |
warn "error getting/parsing $url: $@"; |
657 |
return; |
672 |
return; |
658 |
} |
673 |
} |
659 |
my @elems = $t->look_down(class => 'detailDeel'); |
674 |
|
660 |
if (not @elems) { |
675 |
my @elems = $t->look_down('id' => 'progDetail'); |
661 |
warn "did not see any 'detailDeel' elements, skipping page"; |
676 |
if (not @elems or @elems != 1) { |
662 |
return; |
677 |
warn "did not see one single 'progDetail' element, skipping page"; |
|
|
678 |
return; |
663 |
} |
679 |
} |
|
|
680 |
|
681 |
my $elem = $elems[0]; |
682 |
my @hs = $elem->look_down('_tag' => 'h3'); |
683 |
if (not @hs or @hs != 2) { |
684 |
warn "could not find a valid 'h3' title, skipping page"; |
685 |
return; |
686 |
} |
687 |
|
688 |
my $naam = $hs[0]->as_text(); |
689 |
$naam =~ s/^\s+//; $naam =~ s/\s+$//; |
690 |
|
691 |
my @desc; # accumulate bits |
692 |
my @ps = $elem->look_down('_tag' => 'p'); |
693 |
if (not @ps) { |
694 |
warn "could not find a valid description, skipping page"; |
695 |
return; |
696 |
} |
697 |
|
698 |
for( my $i=0; $i<=2; $i++ ) |
699 |
{ |
700 |
my $text = $ps[$i]->as_text(); |
701 |
$text =~ s/^\s+//; |
702 |
$text =~ s/\s+$//; |
703 |
push @desc, $text if $text ne ''; |
704 |
} |
705 |
|
706 |
@elems = $t->look_down('id' => 'data'); |
707 |
if (not @elems or @elems != 1) { |
708 |
warn "did not see one single 'data' element, skipping page"; |
709 |
return; |
710 |
} |
711 |
|
664 |
my @info; |
712 |
my @info; |
665 |
foreach (@elems) { |
713 |
$elem = $elems[0]; |
666 |
my @cont = grep { ref } $_->content_list(); |
714 |
my @trs = $elem->look_down('_tag' => 'tr'); |
667 |
my $n = scalar @cont; |
715 |
if( not @trs ) { |
668 |
if ($n != 2) { |
716 |
warn "did not find records below the 'data' element, skipping page"; |
669 |
warn "'detailDeel' has $n elements instead of 2"; |
717 |
return; |
670 |
next; |
718 |
} |
671 |
} |
719 |
|
672 |
my ($k, $v) = @cont; |
720 |
foreach (@trs) { |
673 |
for ($k->attr('class')) { |
721 |
my $tr = $_; |
674 |
if (not defined or $_ ne 'detailLabel2') { |
722 |
my @ths = $tr->look_down('_tag' => 'th'); |
675 |
warn "didn't see 'detailLabel2' in 'detailDeel'"; |
723 |
my @tds = $tr->look_down('_tag' => 'td'); |
676 |
next; |
724 |
if( @ths == 1 && @tds == 1 ) { |
677 |
} |
725 |
push @info, [ $ths[0]->as_text(), $tds[0]->as_text() ]; |
678 |
} |
726 |
} |
679 |
for ($v->attr('class')) { |
|
|
680 |
if (not defined or $_ ne 'detailContent2') { |
681 |
warn "didn't see 'detailContent2' in 'detailDeel'"; |
682 |
next; |
683 |
} |
684 |
} |
685 |
push @info, [ $k->as_text(), $v->as_text() ]; |
686 |
} |
727 |
} |
687 |
$t->delete(); undef $t; |
728 |
$t->delete(); undef $t; |
688 |
|
729 |
|
689 |
# Process the list of [ heading, data ] pairs. |
730 |
# Process the list of [ heading, data ] pairs. |
690 |
my ( |
731 |
my ( |
691 |
# Exactly one: |
732 |
# Exactly one: |
692 |
$van, $tot, $naam, |
733 |
$van, $tot, |
693 |
|
734 |
|
694 |
# At most one: |
735 |
# At most one: |
695 |
$director, $previously_shown, $orig_title, $sub_title, $genre, |
736 |
$director, $previously_shown, $orig_title, $sub_title, $genre, |
696 |
$date, $episode_num, $actors, $writers, $commentators, |
737 |
$date, $episode_num, $actors, $writers, $commentators, |
697 |
|
738 |
|
698 |
# Zero or more: |
739 |
# Zero or more: |
699 |
@presenter, @url, |
740 |
@presenter, @url, |
700 |
); |
741 |
); |
|
|
742 |
|
701 |
# NB 'at most one' $actors but that one entry can give several. |
743 |
# NB 'at most one' $actors but that one entry can give several. |
702 |
|
744 |
|
703 |
my ($teletext_sub, $widescreen) = 0; # boolean |
745 |
my ($teletext_sub, $widescreen) = 0; # boolean |
704 |
my @desc; # accumulate bits |
|
|
705 |
my $seen_tijdstip = 0; |
746 |
my $seen_tijdstip = 0; |
706 |
my $last; |
747 |
my $last; |
707 |
ELEM: foreach (@info) { |
|
|
708 |
my ($regel, $text) = @$_; |
709 |
foreach ($regel, $text) { |
710 |
s/^\s+//; s/\s+$//; |
711 |
} |
712 |
|
713 |
if ($regel eq '') { |
714 |
# Continuation of the previous one, hopefully. |
715 |
$regel = $last; |
716 |
} |
717 |
else { |
718 |
# They usually end with a colon but not always. |
719 |
$regel =~ s/:$//; |
720 |
$last = $regel; |
721 |
} |
722 |
|
748 |
|
723 |
if ($regel eq 'Tijdstip') { |
749 |
ELEM: foreach (@info) { |
724 |
warn "seen 'Tijdstip' twice\n" if $seen_tijdstip++; |
750 |
my ($regel, $text) = @$_; |
725 |
if (length($text)<=16) { |
751 |
foreach ($regel, $text) { |
726 |
t "'onvolledig' is true, nothing more to write"; |
752 |
s/^\s+//; s/\s+$//; |
727 |
t 'process_details_page() RETURNING'; |
753 |
} |
728 |
return 'END'; |
754 |
|
729 |
} |
755 |
if ($regel eq '') { |
730 |
else { |
756 |
# Continuation of the previous one, hopefully. |
731 |
# Extract time strings from the text, but not full |
757 |
$regel = $last; |
732 |
# Date::Manip objects. |
758 |
} |
733 |
# |
759 |
else { |
734 |
$van = time_van_str($text); |
760 |
# They usually end with a colon but not always. |
735 |
$tot = time_tot_str($text); |
761 |
$regel =~ s/:$//; |
736 |
} |
762 |
$last = $regel; |
737 |
} |
763 |
} |
738 |
elsif ($regel eq 'Inhoud') { |
764 |
|
739 |
# Empty text for this happens often, just skip it. |
765 |
if ($regel eq 'Datum en tijdstip') { |
740 |
push @desc, $text if $text ne ''; |
766 |
warn "seen 'Tijdstip' twice\n" if $seen_tijdstip++; |
741 |
} |
767 |
if (length($text)<=16) { |
742 |
elsif ($regel eq 'Programma') { |
768 |
t "'onvolledig' is true, nothing more to write"; |
743 |
warn "seen 'Programma' twice\n" if defined $naam; |
769 |
t 'process_details_page() RETURNING'; |
744 |
# FIXME should really look for 'herhaling' in italics. |
770 |
return 'END'; |
745 |
if ($text =~ s/\bherhaling\s+van\s+(\d\d?)-(\d\d?)-(\d{4})//) { |
771 |
} |
746 |
warn "seen previously-shown information twice\n" |
772 |
else { |
747 |
if $previously_shown; |
773 |
# Extract time strings from the text, but not full |
748 |
my ($dd, $mm, $yyyy) = ($1, $2, $3); |
774 |
# Date::Manip objects. |
749 |
$previously_shown = { start => "$yyyy$mm$dd" }; |
775 |
# |
750 |
$text =~ s/^\s+//; $text =~ s/\s+$//; |
776 |
($van,$tot) = time_to_str($text); |
751 |
} |
777 |
} |
752 |
elsif ($text =~ s/herhaling\b//) { |
778 |
} |
753 |
# Repeat, but no previous date given. NB |
779 |
elsif ($regel eq 'Inhoud') { |
754 |
# sometimes we see 'herhaling' without a space |
780 |
# Empty text for this happens often, just skip it. |
755 |
# before it, as in the redundant |
781 |
push @desc, $text if $text ne ''; |
756 |
# |
782 |
} |
757 |
# 'Netwerk herhalingenherhaling' |
783 |
elsif ($regel eq 'Genre') { |
758 |
# |
784 |
warn "seen 'Genre' twice\n" if defined $genre; |
759 |
# Hence no \b at the start of the regexp. We just |
785 |
# Empty text for this happens often, just skip it. |
760 |
# have to hope there aren't too many compound |
786 |
$genre = $text if $text ne ''; |
761 |
# words ending in 'herhaling'. |
787 |
} |
762 |
# |
788 |
elsif ($regel eq 'Zender') { |
763 |
$previously_shown = {}; |
789 |
# In the new layout, this field contains the logo |
764 |
$text =~ s/^\s+//; $text =~ s/\s+$//; |
790 |
# for the channel. Ignore it, because these logos |
765 |
} |
791 |
# are too small to use in an application. |
766 |
$naam = $text; |
792 |
# |
767 |
} |
793 |
} |
768 |
elsif ($regel eq 'Genre') { |
794 |
elsif ($regel eq 'Omroep') { |
769 |
warn "seen 'Genre' twice\n" if defined $genre; |
795 |
# We ignore this setting, because the XMLTV format |
770 |
# Empty text for this happens often, just skip it. |
796 |
# doesn't have room for this. In Holland, a few broadcasters |
771 |
$genre = $text if $text ne ''; |
797 |
# share the same channel. |
772 |
} |
798 |
# |
773 |
elsif ($regel eq 'Zender') { |
799 |
} |
774 |
# I think this means 'broadcaster' but the information |
800 |
elsif ($regel eq 'Bijzonderheden') { |
775 |
# is redundant because we already know the channel. |
801 |
foreach (split /,\s*/, $text) { |
776 |
# |
802 |
if ($_ eq 'Teletekst ondertiteld') { |
777 |
# Then we should check it and warn if it differs! But |
803 |
# I'm guessing this means teletext subtitles :-). |
778 |
# it does differ - every programme on the channel |
804 |
$teletext_sub++ |
779 |
# Nederland 1, it seems, has Zender of 'Nederland |
805 |
&& warn 'seen teletext subtitles twice'; |
780 |
# 2'. So we just ignore this information. |
806 |
} |
781 |
# |
807 |
elsif ($_ eq 'Breedbeeld uitzending') { |
782 |
} |
808 |
$widescreen++ && warn 'seen widescreen twice'; |
783 |
elsif ($regel eq 'Omroep') { |
809 |
} |
784 |
# FIXME I don't know what this means (the dictionary |
810 |
elsif (length >= 50) { |
785 |
# says 'wireless telegraph' but that's no help) so |
811 |
# Some long sentence, part of description. |
786 |
# just ignore it. |
812 |
push @desc, $_; |
787 |
# |
813 |
} |
788 |
} |
814 |
else { |
789 |
elsif ($regel eq 'Kenmerken') { |
815 |
warn "unknown 'Bijzonderheden' bit $_" |
790 |
foreach (split /,\s*/, $text) { |
816 |
unless $warned_regel{"Bijzonderheden: $_"}++; |
791 |
if ($_ eq 'Teletekst ondertiteld') { |
817 |
push @desc, $_; |
792 |
# I'm guessing this means teletext subtitles :-). |
818 |
} |
793 |
$teletext_sub++ |
819 |
} |
794 |
&& warn 'seen teletext subtitles twice'; |
820 |
} |
795 |
} |
821 |
elsif ($regel eq 'Teletekst') { |
796 |
elsif ($_ eq 'Breedbeeld uitzending') { |
822 |
# Teletekst contains the 'teletext' page for this programme |
797 |
$widescreen++ && warn 'seen widescreen twice'; |
823 |
# This information isn't used at the moment |
798 |
} |
824 |
} |
799 |
elsif (length >= 50) { |
825 |
elsif ($regel eq 'Presentatie') { |
800 |
# Some long sentence, part of description. |
826 |
push @presenter, $text; |
801 |
push @desc, $_; |
827 |
} |
802 |
} |
828 |
elsif ($regel eq 'Aflevering') { |
803 |
else { |
829 |
warn "seen 'Aflevering" |
804 |
warn "unknown 'Kenmerken' bit $_" |
830 |
if defined $episode_num; |
805 |
unless $warned_regel{"Kenmerken: $_"}++; |
831 |
if ($text eq 'Slot') { |
806 |
push @desc, $_; |
832 |
# The last episode of a series. There isn't a way to |
807 |
} |
833 |
# store this in the current XMLTV format. |
808 |
} |
834 |
# |
809 |
} |
835 |
warn "discarding 'Slot'" unless $warned_slot++; |
810 |
elsif ($regel eq 'Presentatie') { |
836 |
} |
811 |
push @presenter, $text; |
837 |
elsif ($text =~ /^\d+$/) { |
812 |
} |
838 |
if ($text == 0) { |
813 |
elsif ($regel eq 'Afleverings nummer') { |
839 |
warn "I thought episode nums on the site were from 1"; |
814 |
warn "seen 'Afleverings nummer' twice" |
840 |
} |
815 |
if defined $episode_num; |
841 |
else { |
816 |
if ($text eq 'Slot') { |
842 |
$episode_num = $text - 1; |
817 |
# The last episode of a series. There isn't a way to |
843 |
} |
818 |
# store this in the current XMLTV format. |
844 |
} |
819 |
# |
845 |
elsif ($text =~ /^(?:\d+-)+\d$/) { |
820 |
warn "discarding 'Slot'" unless $warned_slot++; |
846 |
# This means multiple episodes. This ought to be |
821 |
} |
847 |
# handled by turning the programme into a clump. |
822 |
elsif ($text =~ /^\d+$/) { |
848 |
# |
823 |
if ($text == 0) { |
849 |
warn "programme covers multiple episodes ($text), not handled"; |
824 |
warn "I thought episode nums on the site were from 1"; |
850 |
} |
825 |
} |
851 |
else { |
826 |
else { |
852 |
warn "bad episode number $text"; |
827 |
$episode_num = $text - 1; |
853 |
} |
828 |
} |
854 |
} |
829 |
} |
855 |
elsif ($regel eq 'Titel aflevering') { |
830 |
elsif ($text =~ /^(?:\d+-)+\d$/) { |
856 |
warn "seen 'Titel aflevering' twice" |
831 |
# This means multiple episodes. This ought to be |
857 |
if defined $sub_title; |
832 |
# handled by turning the programme into a clump. |
858 |
$sub_title = $text; |
833 |
# |
859 |
} |
834 |
warn "programme covers multiple episodes ($text), not handled"; |
860 |
elsif ($regel eq 'Url') { |
835 |
} |
861 |
# We have to turn the string given, which is normally |
836 |
else { |
862 |
# just a hostname, into a URL. I don't see why they |
837 |
warn "bad episode number $text"; |
863 |
# don't just link to it directly, this is a web site |
838 |
} |
864 |
# after all. |
839 |
} |
865 |
# |
840 |
elsif ($regel eq 'Titel aflevering') { |
866 |
# Anyway, the URI library doesn't seem to have any way |
841 |
warn "seen 'Titel aflevering' twice" |
867 |
# to take a string and turn it into a URL adding |
842 |
if defined $sub_title; |
868 |
# 'http:' if necessary, so we do this by hand. |
843 |
$sub_title = $text; |
869 |
# |
844 |
} |
870 |
if ($text !~ tr/://) { |
845 |
elsif ($regel eq 'Webpagina') { |
871 |
$text = "http://$text"; |
846 |
# We have to turn the string given, which is normally |
872 |
} |
847 |
# just a hostname, into a URL. I don't see why they |
873 |
push @url, $text; |
848 |
# don't just link to it directly, this is a web site |
874 |
} |
849 |
# after all. |
875 |
elsif ($regel eq 'Acteurs') { |
850 |
# |
876 |
warn "seen 'Rolverdeling' twice" if $actors; |
851 |
# Anyway, the URI library doesn't seem to have any way |
877 |
|
852 |
# to take a string and turn it into a URL adding |
878 |
# 'e.a' appearing in the description means 'and others'; |
853 |
# 'http:' if necessary, so we do this by hand. |
879 |
# it's implicit in XMLTV that there might be other actors, |
854 |
# |
880 |
# so we quietly remove it. |
855 |
if ($text !~ tr/://) { |
881 |
# |
856 |
$text = "http://$text"; |
882 |
$text =~ s/\s*e\.a\s*$//; |
857 |
} |
883 |
|
858 |
push @url, $text; |
884 |
while (length $text) { |
859 |
} |
885 |
if ($text =~ s/\s*([^:]+):\s*([^.]+)(?:$|\.)//) { |
860 |
elsif ($regel eq 'Rolverdeling') { |
886 |
warn "discarding information about the parts played by each actor\n" |
861 |
warn "seen 'Rolverdeling' twice" if $actors; |
887 |
unless $warned_discarding_parts++; |
862 |
|
888 |
push @$actors, $2; |
863 |
# 'e.a' appearing in the description means 'and others'; |
889 |
} |
864 |
# it's implicit in XMLTV that there might be other actors, |
890 |
elsif ($text =~ s/\s*([^,]+)(?:$|,)//) { |
865 |
# so we quietly remove it. |
891 |
push @$actors, $1; |
866 |
# |
892 |
} |
867 |
$text =~ s/\s*e\.a\s*$//; |
893 |
else { |
868 |
|
894 |
warn "unknown remnant 'Rolverdeling' text '$text'"; |
869 |
while (length $text) { |
895 |
last; |
870 |
if ($text =~ s/\s*([^:]+):\s*([^.]+)(?:$|\.)//) { |
896 |
} |
871 |
warn "discarding information about the parts played by each actor\n" |
897 |
} |
872 |
unless $warned_discarding_parts++; |
898 |
} |
873 |
push @$actors, $2; |
899 |
elsif ($regel eq 'Scenario') { |
874 |
} |
900 |
warn "seen 'Scenario' twice" if $writers; |
875 |
elsif ($text =~ s/\s*([^,]+)(?:$|,)//) { |
901 |
push @$writers, $text; |
876 |
push @$actors, $1; |
902 |
} |
877 |
} |
903 |
elsif ($regel eq 'Email') { |
878 |
else { |
904 |
push @url, "mailto:$text"; |
879 |
warn "unknown remnant 'Rolverdeling' text '$text'"; |
905 |
} |
880 |
last; |
906 |
elsif ($regel eq 'Bron') { |
881 |
} |
907 |
# FIXME cannot do anything special with this. It |
882 |
} |
908 |
# means 'source' and perhaps by parsing the text we |
883 |
} |
909 |
# could find the names of writers or whatever. |
884 |
elsif ($regel eq 'Scenario schrijver') { |
910 |
# |
885 |
warn "seen 'Scenario schrijver' twice" if $writers; |
911 |
# push @desc, "$regel: $text"; |
886 |
push @$writers, $text; |
912 |
} |
887 |
} |
913 |
elsif ($regel eq 'Commentaar') { |
888 |
elsif ($regel eq 'E-mail') { |
914 |
push @$commentators, $text; |
889 |
push @url, "mailto:$text"; |
915 |
} |
890 |
} |
916 |
elsif ($regel eq 'Jaar van premiere') { |
891 |
elsif ($regel eq 'Bron') { |
917 |
# Year of release, I think. |
892 |
# FIXME cannot do anything special with this. It |
918 |
warn "seen 'Jaar van premiere' twice" |
893 |
# means 'source' and perhaps by parsing the text we |
919 |
if defined $date; |
894 |
# could find the names of writers or whatever. |
920 |
$date = $text; |
895 |
# |
921 |
} |
896 |
push @desc, "$regel: $text"; |
922 |
elsif ($regel eq 'Regisseur') { |
897 |
} |
923 |
warn "seen 'Regisseur' twice" if defined $director; |
898 |
elsif ($regel eq 'Commentaar') { |
924 |
$director = $text; |
899 |
push @$commentators, $text; |
925 |
} |
900 |
} |
926 |
elsif ($regel eq 'Orginele titel') { |
901 |
elsif ($regel eq 'Jaar van premiere') { |
927 |
warn "seen 'Orginele titel' twice" if defined $orig_title; |
902 |
# Year of release, I think. |
928 |
$orig_title = $text; |
903 |
warn "seen 'Jaar van premiere' twice" |
929 |
} |
904 |
if defined $date; |
930 |
elsif ($regel eq 'Behaalde prijzen') { |
905 |
$date = $text; |
931 |
# Awards won. It doesn't seem worth adding a separate |
906 |
} |
932 |
# field for this to the XMLTV format, just append to |
907 |
elsif ($regel eq 'Regisseur') { |
933 |
# the description. |
908 |
warn "seen 'Regisseur' twice" if defined $director; |
934 |
# |
909 |
$director = $text; |
935 |
# push @desc, "$regel: $text"; |
910 |
} |
936 |
} |
911 |
elsif ($regel eq 'Orginele titel') { |
937 |
elsif ($regel eq 'Website') { |
912 |
warn "seen 'Orginele titel' twice" if defined $orig_title; |
938 |
push @url, ($text =~ /^[a-z]+:/) ? $text : "http://$text"; |
913 |
$orig_title = $text; |
939 |
} |
914 |
} |
940 |
else { |
915 |
elsif ($regel eq 'Behaalde prijzen') { |
941 |
# Unknown key, report it back to the prompt |
916 |
# Awards won. It doesn't seem worth adding a separate |
942 |
# |
917 |
# field for this to the XMLTV format, just append to |
943 |
warn "unknown programme info key $regel\n" |
918 |
# the description. |
944 |
unless $warned_regel{$regel}++; |
919 |
# |
945 |
} |
920 |
push @desc, "$regel: $text"; |
|
|
921 |
} |
922 |
elsif ($regel eq 'Website') { |
923 |
push @url, ($text =~ /^[a-z]+:/) ? $text : "http://$text"; |
924 |
} |
925 |
else { |
926 |
# Unknown key, but let's add it to the desc so we |
927 |
# don't lose information. These newlines are just for |
928 |
# the benefit of someone reading the XML by hand. |
929 |
# |
930 |
push @desc, "$regel: $text"; |
931 |
warn "unknown programme info key $regel\n" |
932 |
unless $warned_regel{$regel}++; |
933 |
} |
934 |
} |
946 |
} |
935 |
|
947 |
|
936 |
if (not defined $naam) { |
948 |
if (not defined $naam) { |
937 |
warn "did not see programme title, skipping programme\n"; |
949 |
warn "did not see programme title, skipping programme\n"; |
938 |
return; |
950 |
return; |
939 |
} |
951 |
} |
940 |
if (not defined $van) { |
952 |
if (not defined $van) { |
941 |
warn "did not see programme times, skipping programme\n"; |
953 |
warn "did not find programme starttime, skipping programme\n"; |
942 |
return; |
954 |
return; |
|
|
955 |
} |
956 |
if (not defined $tot) { |
957 |
warn "did not see programme endtime, skipping programme\n"; |
958 |
return; |
943 |
} |
959 |
} |
944 |
die if not defined $tot; |
|
|
945 |
|
960 |
|
946 |
my @title = ([ $naam, $LANG]); |
961 |
my @title = ([ $naam, $LANG]); |
947 |
push @title, [ $orig_title ] if defined $orig_title; # not Dutch! |
962 |
push @title, [ $orig_title ] if defined $orig_title; # not Dutch! |
Lines 983-1023
Link Here
|
983 |
return \%prog; |
998 |
return \%prog; |
984 |
} |
999 |
} |
985 |
|
1000 |
|
986 |
sub time_tot_str( $ ) { |
1001 |
sub time_to_str( $ ) { |
987 |
my $input = shift; |
1002 |
my $input = shift; |
988 |
if (length($input) == 15) { |
|
|
989 |
$input .= '05:00'; |
990 |
} |
991 |
my $datum = substr($input,0,length($input)-11); |
992 |
my $tot = substr($input,-5); |
993 |
|
994 |
$datum =~ /(\d\d?)-(\d\d?)-(\d+)/ |
995 |
or die "cannot find year in '$datum'"; |
996 |
my ($dd, $mm, $yyyy) = ($1, $2, $3); |
997 |
foreach ($dd, $mm) { $_ = "0$_" if length == 1 } |
998 |
$tot =~ /(\d\d):(\d\d)/ |
999 |
or die "cannot find time in '$tot'"; |
1000 |
my ($HH, $MM) = ($1, $2); |
1001 |
|
1003 |
|
1002 |
return "$yyyy-$mm-$dd $HH:$MM:00"; |
1004 |
# Replace months |
1003 |
} |
1005 |
$input =~ s/\bjanuari\b/1/g; |
1004 |
sub time_van_str( $ ) { |
1006 |
$input =~ s/\bfebruari\b/2/g; |
1005 |
my $input = shift; |
1007 |
$input =~ s/\bmaart\b/3/g; |
1006 |
if (length($input) == 15) { |
1008 |
$input =~ s/\bapril\b/4/g; |
1007 |
$input .= '06:00'; |
1009 |
$input =~ s/\bmei\b/5/g; |
|
|
1010 |
$input =~ s/\bjuni\b/6/g; |
1011 |
$input =~ s/\bjuli\b/7/g; |
1012 |
$input =~ s/\baugustus\b/8/g; |
1013 |
$input =~ s/\bseptember\b/9/g; |
1014 |
$input =~ s/\boktober\b/10/g; |
1015 |
$input =~ s/\bnovember\b/11/g; |
1016 |
$input =~ s/\bdecember\b/12/g; |
1017 |
|
1018 |
if( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), (\d\d:\d\d)-(\d\d:\d\d) uur/ ) { |
1019 |
return ( "$3-$2-$1 $4:00", "$3-$2-$1 $5:00" ); |
1020 |
} |
1021 |
elsif( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), -(\d\d):(\d\d) uur/ ) { |
1022 |
return ( "$4-$3-$2 $4:".($5-30).":00", "$3-$2-$1 $4:$5:00" ); |
1023 |
} |
1024 |
elsif( $input =~ /(\d\d?) (\d{1,2}) (\d{4}), (\d\d):(\d\d)- uur/ ) { |
1025 |
return ( "$3-$2-$1 $4:$5:00", "$3-$2-$1 $4:".($5+30).":00" ); |
1008 |
} |
1026 |
} |
1009 |
my $datum = substr($input,0,length($input)-11); |
|
|
1010 |
my $van = substr($input,-11); |
1011 |
$van = substr($van,0,5); |
1012 |
|
1013 |
$datum =~ /(\d\d?)-(\d\d?)-(\d{4})/ |
1014 |
or die "cannot find year in '$datum'"; |
1015 |
my ($dd, $mm, $yyyy) = ($1, $2, $3); |
1016 |
foreach ($dd, $mm) { $_ = "0$_" if length == 1 } |
1017 |
$van =~ /(\d\d):(\d\d)/ |
1018 |
or die "cannot find time in '$van'"; |
1019 |
my ($HH, $MM) = ($1, $2); |
1020 |
return "$yyyy-$mm-$dd $HH:$MM:00"; |
1021 |
} |
1027 |
} |
1022 |
|
1028 |
|
1023 |
|
1029 |
|
Lines 1032-1071
Link Here
|
1032 |
my %channels; |
1038 |
my %channels; |
1033 |
|
1039 |
|
1034 |
if ($sanity) { |
1040 |
if ($sanity) { |
1035 |
# Download what we can. |
1041 |
# Download what we can. |
1036 |
my $url = 'http://www.tvgids.nl'; |
1042 |
my $url = 'http://www.tvgids.nl'; |
1037 |
#All stations are in the select box. |
1043 |
#All stations are in the select box. |
1038 |
#The station ID is the option value |
1044 |
#The station ID is the option value |
1039 |
my $t = new HTML::TreeBuilder(); |
1045 |
my $t = new HTML::TreeBuilder(); |
1040 |
$t->parse(get_nice($url)); |
1046 |
$t->parse(get_nice($url)); |
1041 |
my @conts = map { [ $_->content_list() ] } |
1047 |
my @conts = map { [ $_->content_list() ] } |
1042 |
$t->look_down('_tag' => 'select', 'name' => 'station'); |
1048 |
$t->look_down('_tag' => 'select', 'name' => 'station'); |
1043 |
foreach my $cont (@conts) { |
1049 |
|
1044 |
my @children =@$cont; |
1050 |
foreach my $cont (@conts) { |
1045 |
if (scalar(@children) == 0) { |
1051 |
my @children =@$cont; |
1046 |
warn 'No stations are defined'; |
1052 |
|
1047 |
next; |
1053 |
if (scalar(@children) == 0) { |
1048 |
} |
1054 |
warn 'No stations are defined'; |
1049 |
foreach my $station_line (@children) { |
1055 |
next; |
1050 |
if ($station_line ne ' ') { |
1056 |
} |
1051 |
#This if statement is to prevent parsing the last |
1057 |
|
1052 |
#empty element from the list. |
1058 |
foreach my $station_line (@children) { |
1053 |
|
1059 |
if ($station_line ne ' ') { |
1054 |
my $channel_id = $station_line->attr('value'); |
1060 |
# This if statement is to prevent parsing the last |
1055 |
|
1061 |
# empty element from the list. |
1056 |
#I am only interested in the normal channels. |
1062 |
|
1057 |
#tvgids.nl has some pages for the regional stations also |
1063 |
my $channel_id = $station_line->attr('value'); |
1058 |
#All normal channels have id <0,100> |
1064 |
|
1059 |
#That is at the moment... Could change in future... |
1065 |
# I am only interested in the normal channels. |
1060 |
if ($channel_id > 0 && $channel_id < 100) { |
1066 |
# tvgids.nl has some pages for the regional stations also |
1061 |
my $channel_name = $station_line->as_text(); |
1067 |
# All normal channels have id <0,300> |
1062 |
$channels{$channel_id} = $channel_name |
1068 |
# That is at the moment... Could change in future... |
1063 |
} |
1069 |
if ($channel_id && $channel_id > 0 && $channel_id < 300) { |
1064 |
} |
1070 |
print $channel_id."\n"; |
1065 |
|
1071 |
my $channel_name = $station_line->as_text(); |
1066 |
} |
1072 |
$channels{$channel_id} = $channel_name |
1067 |
} |
1073 |
} |
1068 |
$t->delete(); undef $t; |
1074 |
} |
|
|
1075 |
} |
1076 |
} |
1077 |
|
1078 |
$t->delete(); undef $t; |
1069 |
} |
1079 |
} |
1070 |
|
1080 |
|
1071 |
# Now read the config file and check against any data we already |
1081 |
# Now read the config file and check against any data we already |
Lines 1138-1288
Link Here
|
1138 |
# or else drop it with a warning. |
1148 |
# or else drop it with a warning. |
1139 |
# |
1149 |
# |
1140 |
my %warned_bad_channel_name; |
1150 |
my %warned_bad_channel_name; |
|
|
1151 |
|
1141 |
sub process_summary_page( $$$$ ) { |
1152 |
sub process_summary_page( $$$$ ) { |
1142 |
my ($url, $day, $official_day, $ch_name_ref) = @_; |
1153 |
my ($url, $day, $official_day, $ch_name_ref) = @_; |
1143 |
die if not defined $url; die if ref $url; |
1154 |
die if not defined $url; die if ref $url; |
1144 |
die if not defined $day; die if ref $day; |
1155 |
die if not defined $day; die if ref $day; |
1145 |
die if not defined $official_day; die if ref $official_day; |
1156 |
die if not defined $official_day; die if ref $official_day; |
|
|
1157 |
|
1146 |
local $SIG{__WARN__} = sub { |
1158 |
local $SIG{__WARN__} = sub { |
1147 |
warn "$url: $_[0]"; |
1159 |
warn "$url: $_[0]"; |
1148 |
}; |
1160 |
}; |
|
|
1161 |
|
1149 |
local $SIG{__DIE__} = sub { |
1162 |
local $SIG{__DIE__} = sub { |
1150 |
die "$url: $_[0]"; |
1163 |
die "$url: $_[0]"; |
1151 |
}; |
1164 |
}; |
|
|
1165 |
|
1152 |
my $t = new HTML::TreeBuilder; |
1166 |
my $t = new HTML::TreeBuilder; |
1153 |
$t->parse(get_nice($url)); |
1167 |
$t->parse(get_nice($url)); |
1154 |
my %interesting; ++ $interesting{$_} foreach |
1168 |
my @elems = $t->look_down('_tag' => 'table', 'class' => 'overzicht'); |
1155 |
qw(lijst_zender lijst_tijd details_programma); |
1169 |
|
1156 |
my @elems = $t->look_down(sub { die if not defined $_[0]; |
|
|
1157 |
my $c = $_[0]->attr('class'); |
1158 |
return 0 if not defined $c; |
1159 |
return $interesting{$c} }); |
1160 |
if (not @elems) { |
1170 |
if (not @elems) { |
1161 |
warn 'did not find any programmes in page'; |
1171 |
warn 'did not find any programmes in page'; |
1162 |
return (); |
1172 |
return (); |
1163 |
} |
1173 |
} |
|
|
1174 |
|
1164 |
my @bits; |
1175 |
my @bits; |
|
|
1176 |
|
1165 |
foreach my $e (@elems) { |
1177 |
foreach my $e (@elems) { |
1166 |
my $class = $e->attr('class'); |
1178 |
my @cont = $e->look_down('_tag' => 'tr'); |
1167 |
t "looking at elem of class $class"; |
1179 |
my @elems = $t->look_down('_tag' => 'table', 'class' => 'overzicht'); |
1168 |
my $href; |
1180 |
|
1169 |
check_content: |
1181 |
if (not @elems) { |
1170 |
my @cont = $e->content_list(); |
1182 |
warn 'did not find any programmes in page'; |
1171 |
my $got_href = $e->attr('href'); |
1183 |
return (); |
1172 |
if (defined $got_href) { |
1184 |
} |
1173 |
t "got href: $got_href"; |
|
|
1174 |
warn "seen 'href's contained inside each other" |
1175 |
if defined $href; |
1176 |
$href = $got_href; |
1177 |
} |
1178 |
if (not @cont) { |
1179 |
warn "found $class elem without content, ignoring" |
1180 |
unless $class eq 'lijst_zender'; |
1181 |
next; |
1182 |
} |
1183 |
elsif (@cont == 1) { |
1184 |
for ($cont[0]) { |
1185 |
if (ref) { |
1186 |
# Unpack this extra layer of element. |
1187 |
$e = $_; |
1188 |
goto check_content; |
1189 |
} |
1190 |
s/^\s+//; s/\s+$//; |
1191 |
push @bits, [ $class, $href, $_ ]; |
1192 |
} |
1193 |
} |
1194 |
elsif (@cont > 1) { |
1195 |
warn "found $class elem with more than one elem inside, ignoring" |
1196 |
unless $class eq 'lijst_zender'; |
1197 |
next; |
1198 |
} |
1199 |
} |
1185 |
} |
1200 |
$t->delete(); undef $t; |
1186 |
|
|
|
1187 |
foreach my $e (@elems) { |
1188 |
my @h4elems = $e->look_down('_tag' => 'h4'); |
1189 |
if (not @h4elems) { |
1190 |
warn 'did not find any date in page'; |
1191 |
return (); |
1192 |
} |
1193 |
else { |
1194 |
push @bits, [ $h4elems[0]->as_text(), $_ ]; |
1195 |
} |
1196 |
|
1197 |
my @cont = $e->look_down('_tag' => 'tr'); |
1198 |
foreach my $tr (@cont) { |
1199 |
# Fetch the time |
1200 |
my @ths = $tr->look_down('_tag' => 'th'); |
1201 |
my @tds = $tr->look_down('_tag' => 'td'); |
1202 |
|
1203 |
if( @ths == 1 and @tds > 1 ) { |
1204 |
my $th = $ths[0]; my $td1 = $tds[0]; my $td2 = $tds[1]; |
1205 |
my @ahs = $td1->look_down('_tag' => 'a'); |
1206 |
if( @ahs == 1 ) { |
1207 |
push @bits, [ $th->as_text(), $td1->as_text(), $td2->as_text(), $ahs[0]->attr('href'), $_ ]; |
1208 |
} |
1209 |
} |
1210 |
} |
1211 |
|
1212 |
} |
1213 |
|
1214 |
$t->delete(); |
1215 |
undef $t; |
1216 |
|
1201 |
if (not @bits) { |
1217 |
if (not @bits) { |
1202 |
warn "did not see any content, skipping page"; |
1218 |
warn "did not see any content, skipping page"; |
1203 |
return (); |
1219 |
return (); |
1204 |
} |
1220 |
} |
|
|
1221 |
|
1205 |
t 'got bits: ' . d @bits; |
1222 |
t 'got bits: ' . d @bits; |
1206 |
if ($bits[0]->[0] eq 'lijst_zender') { |
1223 |
|
1207 |
my $date_str = $bits[0]->[2]; |
1224 |
my $date_str = $bits[0]->[0]; |
1208 |
my $d = parse_dutch_date($date_str); |
1225 |
my $d = parse_dutch_date($date_str); |
1209 |
if (defined $d) { |
1226 |
if (defined $d) { |
1210 |
my ($d_base, $d_tz) = @{date_to_local($d, $TZ)}; |
1227 |
my ($d_base, $d_tz) = @{date_to_local($d, $TZ)}; |
1211 |
die "date in page $d_base ($date_str) doesn't match expected $official_day" |
1228 |
die "date in page $d_base ($date_str) doesn't match expected $official_day" |
1212 |
if UnixDate($d_base, '%Q') ne UnixDate($official_day, '%Q'); |
1229 |
if UnixDate($d_base, '%Q') ne UnixDate($official_day, '%Q'); |
1213 |
shift @bits; |
1230 |
shift @bits; |
1214 |
} |
1231 |
} |
1215 |
# otherwise, leave it for later processing |
1232 |
|
1216 |
} |
|
|
1217 |
my @todo; |
1233 |
my @todo; |
1218 |
while (@bits >= 3) { |
1234 |
|
1219 |
my $ch_bit = shift @bits; |
1235 |
while (@bits > 0) { |
1220 |
my $ch_class = $ch_bit->[0]; |
1236 |
my $bit = shift @bits; |
1221 |
if ($ch_class ne 'lijst_zender') { |
1237 |
|
1222 |
warn "bit expected to be channel name has class $ch_class not lijst_zender, skipping"; |
1238 |
my $ch = $bit->[2]; |
1223 |
next; |
1239 |
t 'shifted bit to get $ch=' . d $ch; |
1224 |
} |
1240 |
for ($$ch_name_ref) { |
1225 |
my $ch = $ch_bit->[2]; |
1241 |
if (not defined) { $_ = $ch } |
1226 |
t 'shifted bit to get $ch=' . d $ch; |
1242 |
elsif ($_ ne $ch) { |
1227 |
for ($$ch_name_ref) { |
1243 |
# Most likely the channels file has the wrong name. |
1228 |
if (not defined) { $_ = $ch } |
1244 |
warn "expected channel name $_, got $ch\n" |
1229 |
elsif ($_ ne $ch) { |
1245 |
unless $warned_bad_channel_name{$_}{$ch}++; |
1230 |
# Most likely the channels file has the wrong name. |
1246 |
$_ = $ch; |
1231 |
warn "expected channel name $_, got $ch\n" |
1247 |
} |
1232 |
unless $warned_bad_channel_name{$_}{$ch}++; |
1248 |
} |
1233 |
$_ = $ch; |
|
|
1234 |
} |
1235 |
} |
1236 |
|
1249 |
|
1237 |
my $times_bit = shift @bits; |
1250 |
my $times = $bit->[0]; |
1238 |
my $times_class = $times_bit->[0]; |
1251 |
t 'shifted bit to get $times=' . d $times; |
1239 |
if ($times_class ne 'lijst_tijd') { |
1252 |
|
1240 |
warn "bit expected to be times has class $times_class not lijst_tijd, skipping"; |
1253 |
my $title_href = $bit->[3]; |
1241 |
next; |
1254 |
my $title = $bit->[1]; |
1242 |
} |
1255 |
t 'shifted bit to get $title_href=' . d $title_href . ', ' |
1243 |
my $times = $times_bit->[2]; |
1256 |
. '$title=' . d $title; |
1244 |
t 'shifted bit to get $times=' . d $times; |
1257 |
|
|
|
1258 |
if ($title =~ /^Ieder heel uur .+, tenzij anders vermeld$/) { |
1259 |
# A certain programme on the hour. But it isn't worth |
1260 |
# adding this to the output, ignore it. |
1261 |
# |
1262 |
next; |
1263 |
} |
1264 |
|
1265 |
if ($title eq 'NB: Programmering onder voorbehoud') { |
1266 |
# Programming subject to change. There isn't a way to |
1267 |
# represent this in the current XMLTV format. |
1268 |
# |
1269 |
next; |
1270 |
} |
1245 |
|
1271 |
|
1246 |
my $title_bit = shift @bits; |
1272 |
my ($start_hhmm, $stop_hhmm); |
1247 |
my $title_class = $title_bit->[0]; |
1273 |
if ($times =~ /^(\d\d):(\d\d) - / |
1248 |
if ($title_class ne 'details_programma') { |
1274 |
and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) { |
1249 |
warn "bit expected to be title has class $title_class not details_programma, skipping"; |
1275 |
$start_hhmm = "$1:$2"; |
1250 |
next; |
1276 |
} |
1251 |
} |
1277 |
if ($times =~ / - (\d\d):(\d\d)$/ |
1252 |
my $title_href = $title_bit->[1]; |
1278 |
and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) { |
1253 |
my $title = $title_bit->[2]; |
1279 |
$stop_hhmm = "$1:$2"; |
1254 |
t 'shifted bit to get $title_href=' . d $title_href . ', ' |
1280 |
} |
1255 |
. '$title=' . d $title; |
1281 |
|
1256 |
|
1282 |
# Right, got channel name, times, and title. |
1257 |
if ($title =~ /^Ieder heel uur .+, tenzij anders vermeld$/) { |
1283 |
# FIXME should check channel name (among other things). |
1258 |
# A certain programme on the hour. But it isn't worth |
1284 |
# |
1259 |
# adding this to the output, ignore it. |
1285 |
my $title_url = URI->new_abs($title_href, $url); |
1260 |
# |
1286 |
|
1261 |
next; |
1287 |
# We prefer to handle URLs as strings. |
1262 |
} |
1288 |
push @todo, [ $start_hhmm, $stop_hhmm, $title, "$title_url" ]; |
1263 |
if ($title eq 'NB: Programmering onder voorbehoud') { |
|
|
1264 |
# Programming subject to change. There isn't a way to |
1265 |
# represent this in the current XMLTV format. |
1266 |
# |
1267 |
next; |
1268 |
} |
1269 |
|
1270 |
my ($start_hhmm, $stop_hhmm); |
1271 |
if ($times =~ /^(\d\d):(\d\d)-/ |
1272 |
and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) { |
1273 |
$start_hhmm = "$1:$2"; |
1274 |
} |
1275 |
if ($times =~ /-(\d\d):(\d\d)$/ |
1276 |
and 0 <= $1 and $1 < 24 and 0 <= $2 and $2 < 60) { |
1277 |
$stop_hhmm = "$1:$2"; |
1278 |
} |
1279 |
|
1280 |
# Right, got channel name, times, and title. |
1281 |
# FIXME should check channel name (among other things). |
1282 |
# |
1283 |
my $title_url = URI->new_abs($title_href, $url); |
1284 |
# We prefer to handle URLs as strings. |
1285 |
push @todo, [ $start_hhmm, $stop_hhmm, $title, "$title_url" ]; |
1286 |
} |
1289 |
} |
1287 |
|
1290 |
|
1288 |
# Now we need to make some sense of the times. When stop time |
1291 |
# Now we need to make some sense of the times. When stop time |
Lines 1296-1386
Link Here
|
1296 |
my $next_day = UnixDate(DateCalc($day, '+ 1 day'), '%Y-%m-%d'); |
1299 |
my $next_day = UnixDate(DateCalc($day, '+ 1 day'), '%Y-%m-%d'); |
1297 |
my $crossing_at; |
1300 |
my $crossing_at; |
1298 |
# local $Log::TraceMessages::On = 1; |
1301 |
# local $Log::TraceMessages::On = 1; |
|
|
1302 |
|
1299 |
t 'looking at raw times and searching for midnight crossing: ' . d \@todo; |
1303 |
t 'looking at raw times and searching for midnight crossing: ' . d \@todo; |
|
|
1304 |
|
1300 |
for (my $i = 0; $i < @todo; ++$i) { |
1305 |
for (my $i = 0; $i < @todo; ++$i) { |
1301 |
my ($start_hhmm, $stop_hhmm) = @{$todo[$i]}; |
1306 |
my ($start_hhmm, $stop_hhmm) = @{$todo[$i]}; |
1302 |
t '$start_hhmm=' . d $start_hhmm; |
1307 |
t '$start_hhmm=' . d $start_hhmm; |
1303 |
t '$stop_hhmm=' . d $stop_hhmm; |
1308 |
t '$stop_hhmm=' . d $stop_hhmm; |
1304 |
next if not defined $start_hhmm; |
1309 |
next if not defined $start_hhmm; |
1305 |
next if not defined $stop_hhmm; |
1310 |
next if not defined $stop_hhmm; |
1306 |
|
1311 |
|
1307 |
my $start = parse_local_date("$day $start_hhmm", $TZ); |
1312 |
my $start = parse_local_date("$day $start_hhmm", $TZ); |
1308 |
my $stop = parse_local_date("$day $stop_hhmm", $TZ); |
1313 |
my $stop = parse_local_date("$day $stop_hhmm", $TZ); |
1309 |
t "checking if $start -> $stop goes backwards"; |
1314 |
t "checking if $start -> $stop goes backwards"; |
1310 |
next if Date_Cmp($start, $stop) <= 0; |
1315 |
next if Date_Cmp($start, $stop) <= 0; |
1311 |
t "yup, it's a candidate"; |
1316 |
t "yup, it's a candidate"; |
1312 |
|
1317 |
|
1313 |
my $stop_next_day = parse_local_date("$next_day $stop_hhmm", $TZ); |
1318 |
my $stop_next_day = parse_local_date("$next_day $stop_hhmm", $TZ); |
1314 |
die if Date_Cmp($stop_next_day, $start) <= 0; |
1319 |
die if Date_Cmp($stop_next_day, $start) <= 0; |
1315 |
t 'if it were, stop time on next day would be: ' . d $stop_next_day; |
1320 |
t 'if it were, stop time on next day would be: ' . d $stop_next_day; |
1316 |
|
1321 |
|
1317 |
my $distance = Delta_Format(DateCalc($start, $stop_next_day), 0, '%st'); |
1322 |
my $distance = Delta_Format(DateCalc($start, $stop_next_day), 0, '%st'); |
1318 |
t '...and length of programme: ' . d $distance; |
1323 |
t '...and length of programme: ' . d $distance; |
1319 |
t 'shortest length so far: ' . d $shortest; |
1324 |
t 'shortest length so far: ' . d $shortest; |
1320 |
if (not defined $shortest or $distance < $shortest) { |
1325 |
|
1321 |
t 'this is the best so far'; |
1326 |
if (not defined $shortest or $distance < $shortest) { |
1322 |
$shortest = $distance; |
1327 |
t 'this is the best so far'; |
1323 |
$crossing_at = $i; |
1328 |
$shortest = $distance; |
1324 |
} |
1329 |
$crossing_at = $i; |
|
|
1330 |
} |
1325 |
} |
1331 |
} |
|
|
1332 |
|
1326 |
t '@todo=' . d \@todo; |
1333 |
t '@todo=' . d \@todo; |
1327 |
|
1334 |
|
1328 |
# Now given the place at which we cross from $day to $next_day we |
1335 |
# Now given the place at which we cross from $day to $next_day we |
1329 |
# can add the appropriate days to the hh:mm times. |
1336 |
# can add the appropriate days to the hh:mm times. |
1330 |
# |
1337 |
# |
1331 |
if (not defined $crossing_at) { |
1338 |
if (not defined $crossing_at) { |
1332 |
push @$_, $day, $day foreach @todo; |
1339 |
push @$_, $day, $day foreach @todo; |
1333 |
} |
1340 |
} |
1334 |
else { |
1341 |
else { |
1335 |
for (my $i = 0; $i < $crossing_at; ++$i) { |
1342 |
for (my $i = 0; $i < $crossing_at; ++$i) { |
1336 |
push @{$todo[$i]}, $day, $day; |
1343 |
push @{$todo[$i]}, $day, $day; |
1337 |
} |
1344 |
} |
1338 |
for (my $i = $crossing_at) { |
1345 |
for (my $i = $crossing_at) { |
1339 |
push @{$todo[$i]}, $day, $next_day; |
1346 |
push @{$todo[$i]}, $day, $next_day; |
1340 |
} |
1347 |
} |
1341 |
for (my $i = $crossing_at + 1; $i < @todo; ++$i) { |
1348 |
for (my $i = $crossing_at + 1; $i < @todo; ++$i) { |
1342 |
push @{$todo[$i]}, $next_day, $next_day; |
1349 |
push @{$todo[$i]}, $next_day, $next_day; |
1343 |
} |
1350 |
} |
1344 |
} |
1351 |
} |
1345 |
|
1352 |
|
1346 |
# Now we can parse the dates into Date::Manip objects. |
1353 |
# Now we can parse the dates into Date::Manip objects. |
1347 |
my @r; |
1354 |
my @r; |
1348 |
foreach (@todo) { |
1355 |
foreach (@todo) { |
1349 |
my ($start_hhmm, $stop_hhmm, $title, $title_url, $start_day, $stop_day) = @$_; |
1356 |
my ($start_hhmm, $stop_hhmm, $title, $title_url, $start_day, $stop_day) = @$_; |
1350 |
my ($start, $stop); |
1357 |
my ($start, $stop); |
1351 |
if (defined $start_hhmm) { |
1358 |
if (defined $start_hhmm) { |
1352 |
$start = parse_local_date("$start_day $start_hhmm", $TZ); |
1359 |
$start = parse_local_date("$start_day $start_hhmm", $TZ); |
1353 |
} |
1360 |
} |
1354 |
if (defined $stop_hhmm) { |
1361 |
if (defined $stop_hhmm) { |
1355 |
$stop = parse_local_date("$stop_day $stop_hhmm", $TZ); |
1362 |
$stop = parse_local_date("$stop_day $stop_hhmm", $TZ); |
1356 |
} |
1363 |
} |
1357 |
push @r, [ $start, $stop, $title, $title_url ]; |
1364 |
push @r, [ $start, $stop, $title, $title_url ]; |
1358 |
} |
1365 |
} |
|
|
1366 |
|
1359 |
t 'after parsing dates: ' . d \@r; |
1367 |
t 'after parsing dates: ' . d \@r; |
1360 |
|
1368 |
|
1361 |
# Check the dates and weed out those which are obviously wrong. |
1369 |
# Check the dates and weed out those which are obviously wrong. |
1362 |
my $last_start; |
1370 |
my $last_start; |
1363 |
foreach (@r) { |
1371 |
foreach (@r) { |
1364 |
our ($start, $stop); |
1372 |
our ($start, $stop); |
1365 |
local (*start, *stop) = \ ($_->[0], $_->[1]); |
1373 |
local (*start, *stop) = \ ($_->[0], $_->[1]); |
1366 |
t 'checking dates, $last_start=' . d $last_start; |
1374 |
t 'checking dates, $last_start=' . d $last_start; |
1367 |
t '$start=' . d $start; |
1375 |
t '$start=' . d $start; |
1368 |
t '$stop=' . d $stop; |
1376 |
t '$stop=' . d $stop; |
1369 |
if (defined $start and defined $stop |
1377 |
|
1370 |
and Date_Cmp($start, $stop) > 0) { |
1378 |
if (defined $start and defined $stop |
1371 |
# Appears to stop before it starts. Assume the stop time |
1379 |
and Date_Cmp($start, $stop) > 0) { |
1372 |
# is bogus but the start time might be okay. |
1380 |
# Appears to stop before it starts. Assume the stop time |
1373 |
# |
1381 |
# is bogus but the start time might be okay. |
1374 |
undef $stop; |
1382 |
# |
1375 |
} |
1383 |
undef $stop; |
1376 |
if (defined $last_start) { |
1384 |
} |
1377 |
if (defined $start and Date_Cmp($start, $last_start) < 0) { |
1385 |
|
1378 |
# Appears to start before previous start. |
1386 |
if (defined $last_start) { |
1379 |
undef $start; |
1387 |
if (defined $start and Date_Cmp($start, $last_start) < 0) { |
1380 |
} |
1388 |
# Appears to start before previous start. |
|
|
1389 |
undef $start; |
1390 |
} |
1381 |
if (defined $stop and Date_Cmp($stop, $last_start) < 0) { |
1391 |
if (defined $stop and Date_Cmp($stop, $last_start) < 0) { |
1382 |
# Stops before previous start - that's just as bad. |
1392 |
# Stops before previous start - that's just as bad. |
1383 |
undef $stop; |
1393 |
undef $stop; |
1384 |
} |
1394 |
} |
1385 |
} |
1395 |
} |
1386 |
$last_start = $start if defined $start; |
1396 |
$last_start = $start if defined $start; |
Lines 1389-1407
Link Here
|
1389 |
|
1399 |
|
1390 |
# If there is a 'next page' link do a recursive call to handle it. |
1400 |
# If there is a 'next page' link do a recursive call to handle it. |
1391 |
foreach (@bits) { |
1401 |
foreach (@bits) { |
1392 |
my ($type, $href, $text) = @$_; |
1402 |
my ($type, $href, $text) = @$_; |
1393 |
if ($type eq 'lijst_zender' and $text eq 'Volgende') { |
1403 |
if ($type eq 'lijst_zender' and $text eq 'Volgende') { |
1394 |
# Next page for this day. |
1404 |
# Next page for this day. |
1395 |
my $next_url = URI->new_abs($href, $url); |
1405 |
my $next_url = URI->new_abs($href, $url); |
1396 |
# Turn URL back into a string. |
1406 |
# Turn URL back into a string. |
1397 |
push @r, process_summary_page("$next_url", $day, $official_day, $ch_name_ref) |
1407 |
push @r, process_summary_page("$next_url", $day, $official_day, $ch_name_ref) |
1398 |
} |
1408 |
} |
1399 |
elsif ($type eq 'lijst_zender' and $text eq 'Vorige') { |
1409 |
elsif ($type eq 'lijst_zender' and $text eq 'Vorige') { |
1400 |
# Previous page for this day, assume already fetched. |
1410 |
# Previous page for this day, assume already fetched. |
1401 |
} |
1411 |
} |
1402 |
else { |
1412 |
else { |
1403 |
warn "discarding leftover $type: $text"; |
1413 |
warn "discarding leftover $type: $text"; |
1404 |
} |
1414 |
} |
1405 |
} |
1415 |
} |
1406 |
|
1416 |
|
1407 |
t 'returning tuples: ' . d \@r; |
1417 |
t 'returning tuples: ' . d \@r; |
Lines 1448-1460
Link Here
|
1448 |
s/\bdecember\b/December/g; |
1458 |
s/\bdecember\b/December/g; |
1449 |
s/\bdec\b/December/g; |
1459 |
s/\bdec\b/December/g; |
1450 |
|
1460 |
|
1451 |
s/\bzondag\b/Sunday/g; |
1461 |
s/\bZondag\b/Sunday/g; |
1452 |
s/\bmaandag\b/Monday/g; |
1462 |
s/\bMaandag\b/Monday/g; |
1453 |
s/\bdinsdag\b/Tuesday/g; |
1463 |
s/\bDinsdag\b/Tuesday/g; |
1454 |
s/\bwoensdag\b/Wednesday/g; |
1464 |
s/\bWoensdag\b/Wednesday/g; |
1455 |
s/\bdonderdag\b/Thursday/g; |
1465 |
s/\bDonderdag\b/Thursday/g; |
1456 |
s/\bvrijdag\b/Friday/g; |
1466 |
s/\bVrijdag\b/Friday/g; |
1457 |
s/\bzaterdag\b/Saturday/g; |
1467 |
s/\bZaterdag\b/Saturday/g; |
1458 |
|
1468 |
|
1459 |
my $r; |
1469 |
my $r; |
1460 |
eval { $r = parse_local_date($_, $TZ) }; |
1470 |
eval { $r = parse_local_date($_, $TZ) }; |