#!/usr/bin/perl
#
# Fetch a web page and print the piece of it that looks most like the main
# text of the entry.
#
# Usage: script [URL]
#   With no argument the built-in default URL is fetched.
#
# The page is handled as the raw bytes returned by LWP::Simple::get(); it is
# never decoded, so all lengths below are byte lengths and the Japanese
# punctuation literals only match pages stored in the same encoding as this
# source file.

use strict;
use warnings;
use LWP::Simple;

# Run the program only when this file is executed directly.  When it is
# loaded with require/do, only the subroutines are defined and nothing is
# fetched.
exit main(@ARGV) unless caller;

# main(@args) -> exit status
#
# Fetches the URL given as the first argument (or the default URL) and
# prints the extracted entry body to STDOUT, exactly as extracted (no
# newline is appended).
#
# Returns 0 on success, 1 when the page could not be fetched or no block of
# text qualified as the entry body.
sub main {
    my @args = @_;

    my $url = @args ? $args[0] : 'http://private.ceek.jp/archives/002031.html';

    # LWP::Simple::get() returns undef when the request fails.
    my $html = get($url);
    if (!defined $html) {
        warn "Could not fetch $url\n";
        return 1;
    }

    my $body = get_entry_body($html, $url);
    if (!defined $body) {
        warn "No entry body found in $url\n";
        return 1;
    }

    print $body;
    return 0;
}

# get_entry_body($html, $url) -> string or undef
#
# Heuristically extracts the main text of a page:
#   1. drop everything up to and including </head>;
#   2. if $url carries a "#fragment", drop everything up to and including
#      the <a ... name="fragment" ...> tag, when such a tag exists;
#   3. split the rest at every opening or closing <td>/<div> tag;
#   4. strip the remaining tags from each piece and trim it;
#   5. among the pieces longer than 100 bytes, return the one containing the
#      most punctuation marks (the first one wins a tie).
#
# Always returns exactly one scalar: the chosen text, or undef when $html is
# undefined or no piece is both long enough and contains punctuation.
sub get_entry_body {
    my ($html, $url) = @_;

    my $min_length = 100;    # pieces this short or shorter are ignored
    my $best_text;
    my $best_count = 0;

    return $best_text unless defined $html;

    # Remove the HTML header.
    $html =~ s{\A.*?<\s*/\s*head\s*>}{}is;

    # In-page link: start reading at the named anchor.
    if (defined $url && $url =~ /#(.+)$/) {
        my $name = $1;

        # \Q...\E keeps regex metacharacters in the fragment literal.  The
        # attribute must start right after whitespace, so a bare
        # <a name="x"> matches, and the lookahead stops the fragment "foo"
        # from matching name="foobar".
        $html =~ s{
            \A .*?
            < \s* a \s+ (?: [^<>]*? \s )?
            name \s* = \s* ["']? \Q$name\E ["']?
            (?= [\s/>] ) [^<>]* >
        }{}isx;
    }

    # Candidate blocks are delimited by opening/closing <td> and <div> tags.
    my $block_separator = qr{<\s*/?\s*(?:td|div)\b[^<>]*>}i;

    # Alternation rather than a character class on purpose: without
    # "use utf8" the Japanese marks are multi-byte sequences, and a class
    # would match their individual bytes.  "." and "?" must be escaped to be
    # taken literally.
    my $punctuation = qr/、|。|,|\.|!|\?/;

    for my $piece (split $block_separator, $html) {
        my $text = trim_strip_tags($piece);
        next unless length($text) > $min_length;

        # List assignment in scalar context yields the number of matches.
        my $count = () = $text =~ /$punctuation/g;
        if ($count > $best_count) {
            $best_count = $count;
            $best_text  = $text;
        }
    }

    return $best_text;
}

# trim_strip_tags($str) -> string or undef
#
# Removes everything that looks like a tag ("<...>") from $str and trims
# leading and trailing whitespace.  An undefined argument is returned
# unchanged.
sub trim_strip_tags {
    my ($str) = @_;

    if (defined $str) {
        $str =~ s/<[^<>]+>//g;    # remove tags

        # Second pass: removes tags that only become matchable once the
        # first pass has run, e.g. "<a<b>c>".
        # NOTE(review): the original comment on this statement was truncated
        # and may have referred to something other than tags (possibly
        # character entities) -- confirm the intent.
        $str =~ s/<[^<>]+>//g;

        $str =~ s/^\s+//;
        $str =~ s/\s+$//;
    }

    return $str;
}