Lines 138-143
BEGIN {
Link Here
|
138 |
else { |
138 |
else { |
139 |
*t = \&Log::TraceMessages::t; |
139 |
*t = \&Log::TraceMessages::t; |
140 |
*d = \&Log::TraceMessages::d; |
140 |
*d = \&Log::TraceMessages::d; |
|
|
141 |
#$Log::TraceMessages::On = 1; |
141 |
} |
142 |
} |
142 |
} |
143 |
} |
143 |
|
144 |
|
Lines 502-510
sub parse_page($$) {
Link Here
|
502 |
} |
503 |
} |
503 |
|
504 |
|
504 |
#-- extract date of grabbed data from retrieved webpage ... |
505 |
#-- extract date of grabbed data from retrieved webpage ... |
505 |
$_ = $page->look_down('_tag' => 'td', 'class' => 'navigator-hhead-large'); |
506 |
$_ = $page->look_down('_tag' => 'span', 'class' => 'text-weiss'); |
506 |
die("cannot find date on requested page") |
507 |
die("cannot find date on requested page") |
507 |
unless($_->as_text() =~ m/([1-3]?[0-9])\.(1?[0-9])\.(20[0-9]{2})/); |
508 |
unless($_->as_text() =~ m/([1-3]?[0-9])\.(1?[0-9])\.(20[0-9]{2})/); |
|
|
509 |
t "extracted date: $3-$2-$1"; |
508 |
$day = ParseDate("$3-$2-$1 00:00:00"); |
510 |
$day = ParseDate("$3-$2-$1 00:00:00"); |
509 |
|
511 |
|
510 |
#-- well, now let's scan the table for programme data |
512 |
#-- well, now let's scan the table for programme data |
Lines 573-578
sub parse_page($$) {
Link Here
|
573 |
$show{q(episode-num)} = [ [ $1, "onscreen" ] ]; |
575 |
$show{q(episode-num)} = [ [ $1, "onscreen" ] ]; |
574 |
} |
576 |
} |
575 |
|
577 |
|
|
|
578 |
t "show title: $span"; |
576 |
$show{title} = [[ $span, $lang ]]; |
579 |
$show{title} = [[ $span, $lang ]]; |
577 |
} |
580 |
} |
578 |
elsif (ref($span) eq "HTML::Element" and $span->tag eq "a") { |
581 |
elsif (ref($span) eq "HTML::Element" and $span->tag eq "a") { |
Lines 583-589
sub parse_page($$) {
Link Here
|
583 |
|
586 |
|
584 |
my $title = ($tag->content_list())[0]; |
587 |
my $title = ($tag->content_list())[0]; |
585 |
|
588 |
|
586 |
$title = convert_cp1252_chars(\$title); |
589 |
convert_cp1252_chars(\$title); |
587 |
|
590 |
|
588 |
$title =~ s/\s*\([^\(]+\)\s*$//; |
591 |
$title =~ s/\s*\([^\(]+\)\s*$//; |
589 |
if ($title =~ s/\s*(\d+)\.\sTeil//gi) { |
592 |
if ($title =~ s/\s*(\d+)\.\sTeil//gi) { |
Lines 836-841
sub squeeze_out_desc($$) {
Link Here
|
836 |
# try to match <category>, <country> <year>; R: <names>; D: <names> construct |
839 |
# try to match <category>, <country> <year>; R: <names>; D: <names> construct |
837 |
# where <country>/<year> or the [RD]: stuff may be missing ... |
840 |
# where <country>/<year> or the [RD]: stuff may be missing ... |
838 |
if(my @parts = ($$desc =~ m/^\s*(\(([^\)]*)\))?\s+([^,;0-9]+)(,?\s+([^,;]+)\s+([12][09][0-9]{2}(?:[\/-][0-9]{2})?))?\s*; (?:(?:; )?(Buch\/Regie|R): ([^;]+))?\s*((?:; )?D: (.+))?\s*$/)) { |
841 |
if(my @parts = ($$desc =~ m/^\s*(\(([^\)]*)\))?\s+([^,;0-9]+)(,?\s+([^,;]+)\s+([12][09][0-9]{2}(?:[\/-][0-9]{2})?))?\s*; (?:(?:; )?(Buch\/Regie|R): ([^;]+))?\s*((?:; )?D: (.+))?\s*$/)) { |
|
|
842 |
t "split rule: <category>, <country> <year> ..."; |
839 |
$$desc = ""; |
843 |
$$desc = ""; |
840 |
|
844 |
|
841 |
#-- $parts[1] is the show title in English (doesn't have to be available) |
845 |
#-- $parts[1] is the show title in English (doesn't have to be available) |
Lines 887-895
sub squeeze_out_desc($$) {
Link Here
|
887 |
} |
891 |
} |
888 |
} |
892 |
} |
889 |
else { |
893 |
else { |
|
|
894 |
t "split rule: dot splitting"; |
890 |
my @data = split "·", $$desc; |
895 |
my @data = split "·", $$desc; |
891 |
s/(^\s|\s$)//g foreach(@data); #CHG# |
896 |
s/(^\s|\s$)//g foreach(@data); #CHG# |
892 |
|
897 |
|
|
|
898 |
for(0 .. (scalar(@data) - 1)) { |
899 |
t "dot-split part $_: " . $data[$_]; |
900 |
} |
901 |
|
893 |
if(scalar(@data) == 3 |
902 |
if(scalar(@data) == 3 |
894 |
&& not($data[1] =~ m/[\wäöüßÄÖÜ]+:/) #- FIX false positive: tvtoday.de seems to publish "guests: <names>" here some (rare) times :-( |
903 |
&& not($data[1] =~ m/[\wäöüßÄÖÜ]+:/) #- FIX false positive: tvtoday.de seems to publish "guests: <names>" here some (rare) times :-( |
895 |
&& $data[2] =~ m/^Mit (.*?)$/) { |
904 |
&& $data[2] =~ m/^Mit (.*?)$/) { |
Lines 945-959
sub squeeze_out_desc($$) {
Link Here
|
945 |
next; |
954 |
next; |
946 |
} |
955 |
} |
947 |
|
956 |
|
948 |
if (my ($cat, $rest1, $names, $guests, $rest2) = m/^([^,]+?)((?:\s+-\s+..+?)*) - Moderation: (.+?) - Gäste: (..+?)(?:\s+-\s+(.+))?$/) { |
957 |
if (my ($nocat, $cat, $rest1, $names, $guests, $rest2) = m/^(([^,.%^&*();]+?)((?:\s+-\s+..+?)*)|.+) - Moderation: (.+?) - Gäste: (..+?)(?:\s+-\s+(.+))?$/) { |
949 |
my @data = split_up_names($names, $show); |
958 |
my @data = split_up_names($names, $show); |
950 |
push @{$show->{"credits"}{"presenter"}}, @data; |
959 |
push @{$show->{"credits"}{"presenter"}}, @data; |
951 |
my @guest_data = split_up_names($guests, $show); |
960 |
my @guest_data = split_up_names($guests, $show); |
952 |
push @{$show->{"credits"}{"guest"}}, @guest_data; |
961 |
push @{$show->{"credits"}{"guest"}}, @guest_data; |
953 |
$show->{"category"} = [[ $cat, $lang ]]; |
962 |
|
954 |
|
963 |
if(defined($cat)) { |
955 |
warn "misdetected category: $cat" |
964 |
$show->{"category"} = [[ $cat, $lang ]]; |
956 |
if($cat =~ m/\d{4}/); |
965 |
|
|
|
966 |
warn "misdetected category: $cat" |
967 |
if($cat =~ m/\d{4}/); |
968 |
} |
969 |
else { |
970 |
t "no-cat match: $nocat"; |
971 |
$rest1 = $nocat; |
972 |
} |
957 |
|
973 |
|
958 |
my @rest; |
974 |
my @rest; |
959 |
foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) { |
975 |
foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) { |
Lines 963-976
sub squeeze_out_desc($$) {
Link Here
|
963 |
next unless length($_); |
979 |
next unless length($_); |
964 |
} |
980 |
} |
965 |
|
981 |
|
966 |
if (my ($cat, $rest1, $names, $rest2) = m/^([^,]+?)((?:\s+-\s+..+?)*) - Moderation: (.+?)(?:\s+-\s+(.+))?$/) { |
982 |
if (my ($nocat, $cat, $rest1, $names, $rest2) = m/^(([^,]+?)((?:\s+-\s+..+?)*)|.+) - Moderation: (.+?)(?:\s+-\s+(.+))?$/) { |
967 |
my @data = split_up_names($names, $show); |
983 |
my @data = split_up_names($names, $show); |
968 |
push @{$show->{"credits"}{"presenter"}}, @data; |
984 |
push @{$show->{"credits"}{"presenter"}}, @data; |
969 |
$show->{"category"} = [[ $cat, $lang ]]; |
|
|
970 |
|
985 |
|
971 |
warn "misdetected category: $cat" |
986 |
if(defined($cat)) { |
972 |
if($cat =~ m/\d{4}/); |
987 |
$show->{"category"} = [[ $cat, $lang ]]; |
973 |
|
988 |
|
|
|
989 |
warn "misdetected category: $cat" |
990 |
if($cat =~ m/\d{4}/); |
991 |
} |
992 |
else { |
993 |
t "no-cat match: $nocat"; |
994 |
$rest1 = $nocat; |
995 |
} |
996 |
|
974 |
my @rest; |
997 |
my @rest; |
975 |
foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) { |
998 |
foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) { |
976 |
push @rest, $_ if(defined($_) && length($_)); |
999 |
push @rest, $_ if(defined($_) && length($_)); |