Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 95166 | Differences between
and this patch

Collapse All | Expand All

(-)grab/de_tvtoday/tv_grab_de_tvtoday.in (-12 / +35 lines)
Lines 138-143 BEGIN { Link Here
138
    else {
138
    else {
139
        *t = \&Log::TraceMessages::t;
139
        *t = \&Log::TraceMessages::t;
140
        *d = \&Log::TraceMessages::d;
140
        *d = \&Log::TraceMessages::d;
141
	#$Log::TraceMessages::On = 1;
141
    }
142
    }
142
}
143
}
143
144
Lines 502-510 sub parse_page($$) { Link Here
502
    }
503
    }
503
	
504
	
504
    #-- extract date of grabbed data from retrieved webpage ...
505
    #-- extract date of grabbed data from retrieved webpage ...
505
    $_ = $page->look_down('_tag' => 'td', 'class' => 'navigator-hhead-large');
506
    $_ = $page->look_down('_tag' => 'span', 'class' => 'text-weiss');
506
    die("cannot find date on requested page") 
507
    die("cannot find date on requested page") 
507
      unless($_->as_text() =~ m/([1-3]?[0-9])\.(1?[0-9])\.(20[0-9]{2})/);
508
      unless($_->as_text() =~ m/([1-3]?[0-9])\.(1?[0-9])\.(20[0-9]{2})/);
509
    t "extracted date: $3-$2-$1";
508
    $day = ParseDate("$3-$2-$1 00:00:00");
510
    $day = ParseDate("$3-$2-$1 00:00:00");
509
511
510
    #-- well, now let's scan the table for programme data
512
    #-- well, now let's scan the table for programme data
Lines 573-578 sub parse_page($$) { Link Here
573
		$show{q(episode-num)} = [ [ $1, "onscreen" ] ];
575
		$show{q(episode-num)} = [ [ $1, "onscreen" ] ];
574
	    }
576
	    }
575
577
578
	    t "show title: $span";
576
	    $show{title} = [[ $span, $lang ]];
579
	    $show{title} = [[ $span, $lang ]];
577
	} 
580
	} 
578
	elsif (ref($span) eq "HTML::Element" and $span->tag eq "a") {
581
	elsif (ref($span) eq "HTML::Element" and $span->tag eq "a") {
Lines 583-589 sub parse_page($$) { Link Here
583
	    
586
	    
584
	    my $title = ($tag->content_list())[0];
587
	    my $title = ($tag->content_list())[0];
585
588
586
	    $title = convert_cp1252_chars(\$title);
589
	    convert_cp1252_chars(\$title);
587
590
588
	    $title =~ s/\s*\([^\(]+\)\s*$//;
591
	    $title =~ s/\s*\([^\(]+\)\s*$//;
589
	    if ($title =~ s/\s*(\d+)\.\sTeil//gi) {
592
	    if ($title =~ s/\s*(\d+)\.\sTeil//gi) {
Lines 836-841 sub squeeze_out_desc($$) { Link Here
836
    # try to match <category>, <country> <year>; R: <names>; D: <names> construct
839
    # try to match <category>, <country> <year>; R: <names>; D: <names> construct
837
    # where <country>/<year> or the [RD]: stuff may be missing ...
840
    # where <country>/<year> or the [RD]: stuff may be missing ...
838
    if(my @parts = ($$desc =~ m/^\s*(\(([^\)]*)\))?\s+([^,;0-9]+)(,?\s+([^,;]+)\s+([12][09][0-9]{2}(?:[\/-][0-9]{2})?))?\s*; (?:(?:; )?(Buch\/Regie|R): ([^;]+))?\s*((?:; )?D: (.+))?\s*$/)) {
841
    if(my @parts = ($$desc =~ m/^\s*(\(([^\)]*)\))?\s+([^,;0-9]+)(,?\s+([^,;]+)\s+([12][09][0-9]{2}(?:[\/-][0-9]{2})?))?\s*; (?:(?:; )?(Buch\/Regie|R): ([^;]+))?\s*((?:; )?D: (.+))?\s*$/)) {
842
        t "split rule: <category>, <country> <year> ...";
839
	$$desc = "";
843
	$$desc = "";
840
844
841
	#-- $parts[1] is the show title in English (doesn't have to be available)
845
	#-- $parts[1] is the show title in English (doesn't have to be available)
Lines 887-895 sub squeeze_out_desc($$) { Link Here
887
	}
891
	}
888
    } 
892
    } 
889
    else {
893
    else {
894
        t "split rule: dot splitting";
890
	my @data = split "·", $$desc;
895
	my @data = split "·", $$desc;
891
	s/(^\s|\s$)//g foreach(@data); #CHG#
896
	s/(^\s|\s$)//g foreach(@data); #CHG#
892
897
898
	for(0 .. (scalar(@data) - 1)) {
899
	    t "dot-split part $_: " . $data[$_];
900
	}
901
893
	if(scalar(@data) == 3 
902
	if(scalar(@data) == 3 
894
	   && not($data[1] =~ m/[\wäöüßÄÖÜ]+:/) #- FIX false positive: tvtoday.de seems to publish "guests: <names>" here some (rare) times :-(  
903
	   && not($data[1] =~ m/[\wäöüßÄÖÜ]+:/) #- FIX false positive: tvtoday.de seems to publish "guests: <names>" here some (rare) times :-(  
895
	   && $data[2] =~ m/^Mit (.*?)$/) {
904
	   && $data[2] =~ m/^Mit (.*?)$/) {
Lines 945-959 sub squeeze_out_desc($$) { Link Here
945
		    next;
954
		    next;
946
		}
955
		}
947
956
948
		if (my ($cat, $rest1, $names, $guests, $rest2) = m/^([^,]+?)((?:\s+-\s+..+?)*) - Moderation: (.+?) - Gäste: (..+?)(?:\s+-\s+(.+))?$/) {
957
		if (my ($nocat, $cat, $rest1, $names, $guests, $rest2) = m/^(([^,.%^&*();]+?)((?:\s+-\s+..+?)*)|.+) - Moderation: (.+?) - Gäste: (..+?)(?:\s+-\s+(.+))?$/) {
949
		    my @data = split_up_names($names, $show);
958
		    my @data = split_up_names($names, $show);
950
		    push @{$show->{"credits"}{"presenter"}}, @data;
959
		    push @{$show->{"credits"}{"presenter"}}, @data;
951
		    my @guest_data = split_up_names($guests, $show);
960
		    my @guest_data = split_up_names($guests, $show);
952
		    push @{$show->{"credits"}{"guest"}}, @guest_data;
961
		    push @{$show->{"credits"}{"guest"}}, @guest_data;
953
		    $show->{"category"} = [[ $cat, $lang ]];
962
954
		    
963
		    if(defined($cat)) {
955
	            warn "misdetected category: $cat" 
964
			$show->{"category"} = [[ $cat, $lang ]];
956
		      if($cat =~ m/\d{4}/);
965
			
966
			warn "misdetected category: $cat" 
967
			  if($cat =~ m/\d{4}/);
968
		    }
969
		    else {
970
		        t "no-cat match: $nocat";
971
			$rest1 = $nocat;
972
		    }
957
		      
973
		      
958
		    my @rest; 
974
		    my @rest; 
959
		    foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) {
975
		    foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) {
Lines 963-976 sub squeeze_out_desc($$) { Link Here
963
		    next unless length($_);
979
		    next unless length($_);
964
		}
980
		}
965
981
966
		if (my ($cat, $rest1, $names, $rest2) = m/^([^,]+?)((?:\s+-\s+..+?)*) - Moderation: (.+?)(?:\s+-\s+(.+))?$/) {
982
		if (my ($nocat, $cat, $rest1, $names, $rest2) = m/^(([^,]+?)((?:\s+-\s+..+?)*)|.+) - Moderation: (.+?)(?:\s+-\s+(.+))?$/) {
967
		    my @data = split_up_names($names, $show);
983
		    my @data = split_up_names($names, $show);
968
		    push @{$show->{"credits"}{"presenter"}}, @data;
984
		    push @{$show->{"credits"}{"presenter"}}, @data;
969
		    $show->{"category"} = [[ $cat, $lang ]];
970
985
971
	            warn "misdetected category: $cat" 
986
		    if(defined($cat)) {
972
		      if($cat =~ m/\d{4}/);
987
			$show->{"category"} = [[ $cat, $lang ]];
973
		    
988
989
			warn "misdetected category: $cat" 
990
			  if($cat =~ m/\d{4}/);
991
		    }
992
		    else {
993
		        t "no-cat match: $nocat";
994
			$rest1 = $nocat;
995
		    }
996
			
974
		    my @rest; 
997
		    my @rest; 
975
		    foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) {
998
		    foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) {
976
			push @rest, $_ if(defined($_) && length($_));
999
			push @rest, $_ if(defined($_) && length($_));

Return to bug 95166