use DBI;
# Main script: compare the description of one randomly chosen page with the
# descriptions of every page that matches the same search phrase.
#
# Reads one search phrase from input (<>). Then, until the package global
# $count reaches 10:
#   1. fetches one random row matching the phrase (url, description, rank);
#   2. walks every matching row in rank order, computes the Levenshtein
#      distance between the two descriptions and classifies the pair;
#   3. prints the classification and appends a CSV line to
#      rank-beta-output.txt.
#
# NOTE(review): the same script body is repeated further down this file and
# shares the package globals used here ($count, $cursor, $input1, ...), so
# those names are intentionally left as package globals.
my ($dns) = "DBI:mysql:xxx:xxx";
my ($username) = "xxxx";
my ($password) = "xxx";
# $sth, @ary, $dbfile and $outfile are not used by the code visible here.
my ($dbh, $sth);
my (@ary);
my $dbfile = "spider.dat";
my $outfile = "bunseki-output.csv";

# Distances above this threshold are reported as "far".
$certaindist = 25;

# Search phrase. The line terminator is stripped and a missing line becomes
# the empty string, so the value can be bound as a plain SQL parameter.
$input1 = <>;
$input1 = "" unless defined $input1;
$input1 =~ s/\r?\n\z//;

#### database connection
$dbh = DBI->connect($dns, $username, $password)
    or die "Can not connect to database: $DBI::errstr\n";
$dbh->do("SET NAMES sjis");

# The phrase is passed through a placeholder (?) instead of being pasted
# into the SQL text, so quotes in the input can neither break nor alter
# the statement.
$SQL_QUERY = <<'__CURSOR_1__';
select url.url, descriptions, url.rank from keywords inner join url on keywords.id = url.id inner join descriptions on keywords.id = descriptions.id where match (keywords.keywords) against (? IN BOOLEAN MODE) order by RAND() limit 1
__CURSOR_1__

$SQL_QUERY2 = <<'__CURSOR_2__';
select url, descriptions, url.rank from keywords inner join url on keywords.id = url.id inner join descriptions on keywords.id = descriptions.id where match (keywords.keywords) against (? IN BOOLEAN MODE) order by url.rank
__CURSOR_2__

# Both statements are prepared once and re-executed on every pass.
my $pick_sth = $dbh->prepare($SQL_QUERY)
    or die "Can not prepare random-row query: " . $dbh->errstr . "\n";
$cursor = $dbh->prepare($SQL_QUERY2)
    or die "Can not prepare ranked-rows query: " . $dbh->errstr . "\n";

# Append-mode output file, opened once for the whole run.
open(my $out, '>>', 'rank-beta-output.txt')
    or die "Can not open rank-beta-output.txt for appending: $!\n";

$rankplus = 0.1;

until ($count >= 10) {
    # Reference page: one random row matching the phrase.
    $pick_sth->execute($input1)
        or die "Random-row query failed: " . $pick_sth->errstr . "\n";
    while (my @row = $pick_sth->fetchrow_array) {
        $firsturl     = $row[0];
        $description1 = $row[1];
        $rank         = $row[2];
    }

    $similar = 0;

    # Every matching page, in ascending rank order.
    $cursor->execute($input1)
        or die "Ranked-rows query failed: " . $cursor->errstr . "\n";
    while (my @rec = $cursor->fetchrow_array) {
        $secondurl    = $rec[0];
        $description2 = $rec[1];
        $rank2        = $rec[2];

        # Decided afresh for every row. Previously an exact tie at 1.9 left
        # the previous row's value in place.
        # NOTE(review): ties are now counted as similar - confirm.
        $similar = ($rank2 - $rankplus <= 1.9) ? 1 : 0;

        # Levenshtein distance between $description1 and $description2.
        $distance = levenshtein($description1, $description2);

        # Likewise decided per row; a distance equal to the threshold used
        # to keep the previous row's label.
        # NOTE(review): a distance equal to the threshold is now "near" - confirm.
        $result = ($distance > $certaindist) ? "far" : "near";

        print "distance: $distance similar: $similar near or far: $result\n";
        print {$out} "\"$firsturl\",\"$secondurl\",\"$rank2\",\"$similar\",\"$distance\",\"$result\"\n";
    }
    #print "success\n";
    $count++;
}

close($out)
    or die "Can not close rank-beta-output.txt: $!\n";
$pick_sth->finish;
$cursor->finish;
$dbh->disconnect;
# Levenshtein (edit) distance between two strings.
#
# Arguments:
#   $s1, $s2 - the strings to compare; undef is treated as the empty string.
# Returns:
#   the minimum number of single-character insertions, deletions and
#   substitutions needed to turn $s1 into $s2 (always a defined number).
#
# "Character" means whatever split(//) yields, so on undecoded multi-byte
# text every byte is compared on its own.
#
# NOTE(review): this file defines `levenshtein` a second time further down;
# Perl keeps the later definition.
sub levenshtein
{
    my ($s1, $s2) = @_;
    $s1 = '' unless defined $s1;
    $s2 = '' unless defined $s2;
    my ($len1, $len2) = (length $s1, length $s2);

    # If one of the strings is empty, the distance is the length
    # of the other string
    return $len2 if ($len1 == 0);
    return $len1 if ($len2 == 0);

    # Char-by-char processing is ahead, so prepare arrays of chars
    my @ar1 = split(//, $s1);
    my @ar2 = split(//, $s2);

    # Only the previous row of the distance matrix is needed to compute the
    # next one, so two rows are kept instead of the whole matrix.
    # Row 0 holds the distance from the empty string to each prefix of $s2.
    my @prev = (0 .. $len2);

    for (my $i = 1; $i <= $len1; ++$i)
    {
        # Column 0: distance from the first $i chars of $s1 to the empty string
        my @cur = ($i);

        for (my $j = 1; $j <= $len2; ++$j)
        {
            # Substitution cost: 0 when the ith char of $s1 equals the
            # jth char of $s2, otherwise 1
            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;

            # Minimum of:
            #  - the cell diagonally above-left plus the substitution cost
            #  - the cell immediately above plus 1
            #  - the cell immediately to the left plus 1
            my $best = $prev[$j-1] + $cost;
            $best = $prev[$j] + 1  if ($prev[$j] + 1 < $best);
            $best = $cur[$j-1] + 1 if ($cur[$j-1] + 1 < $best);

            $cur[$j] = $best;
        }
        @prev = @cur;
    }

    # The last cell of the last row is the distance between the full strings
    return $prev[$len2];
}
# Smallest number in a list.
#
# Takes a single array reference and returns the numerically smallest
# element it holds (undef when the array is empty).
#
# NOTE(review): this file defines `min` a second time further down;
# Perl keeps the later definition.
sub min
{
    my ($values) = @_;

    # Start from the first element and compare the remainder against it.
    my ($lowest, @rest) = @{$values};
    for my $candidate (@rest) {
        $lowest = $candidate if ($candidate < $lowest);
    }

    return $lowest;
}
# Combine three numeric scores into one rank value.
#
# Positional arguments (labels taken from the original inline comments):
#   1. url
#   2. keywords      - weighted by 0.3
#   3. descriptions
# Returns: url + 0.3 * keywords + descriptions
#
# The arguments are held in lexicals, so calling this sub no longer
# overwrites the package globals $num, $num2 and $num3.
#
# NOTE(review): this file defines `rankertext` a second time further down;
# Perl keeps the later definition.
sub rankertext{
    my ($url_score, $keyword_score, $description_score) = @_;
    return $url_score + (0.3 * $keyword_score) + $description_score;
}
# Count the characters in a byte string that mixes double-byte (full-width)
# and single-byte (half-width) characters.
#
# Argument: the text to measure (undef behaves like the empty string).
# Returns:  number of double-byte characters + length of what remains.
#
# All working values are lexicals: the sub no longer overwrites the caller's
# $_, and the half-width count no longer carries over from an earlier call
# when the remainder is empty.
#
# NOTE(review): a single byte outside [0-9A-Za-z_ ] that is not part of a
# pair stays encoded as "%XX" and therefore adds 3 to the total, and two
# such bytes in a row are counted as one double-byte character. That matches
# the original behaviour - confirm it is intended.
#
# NOTE(review): this file defines `countcharacter` a second time further
# down; Perl keeps the later definition.
sub countcharacter {
    my ($message) = @_;
    $message = "" unless defined $message;

    # First hex-encode every byte outside [0-9A-Za-z_ ] as %XX.
    my $encoded = "$message";
    $encoded =~ s/([^0-9A-Za-z_ ])/'%'.unpack('H2',$1)/ge;

    # A full-width Japanese character now looks like "%82%a0" or "%8c%ea",
    # so every %XX%XX pair is removed and counted as one full-width
    # character. s///g returns an empty string when nothing matched.
    my $remainder  = $encoded;
    my $zenkaku_su = ($remainder =~ s/%\w\w%\w\w//g) || 0;

    # Whatever is left is counted as half-width text.
    my $hankaku_su = length $remainder;

    return $zenkaku_su + $hankaku_su;
}
use DBI;
# Main script: compare the description of one randomly chosen page with the
# descriptions of every page that matches the same search phrase.
#
# Reads one search phrase from input (<>). Then, until the package global
# $count reaches 10:
#   1. fetches one random row matching the phrase (url, description, rank);
#   2. walks every matching row in rank order, computes the Levenshtein
#      distance between the two descriptions and classifies the pair;
#   3. prints the classification and appends a CSV line to
#      rank-beta-output.txt.
#
# NOTE(review): this section repeats the script that starts at the top of
# this file. $count is a package global that is never reset, so once the
# earlier copy has advanced it to 10 the loop below is skipped. It is
# deliberately not reset here, to keep that behaviour unchanged.
my ($dns) = "DBI:mysql:xxx:xxx";
my ($username) = "xxxx";
my ($password) = "xxx";
# $sth, @ary, $dbfile and $outfile are not used by the code visible here.
my ($dbh, $sth);
my (@ary);
my $dbfile = "spider.dat";
my $outfile = "bunseki-output.csv";

# Distances above this threshold are reported as "far".
$certaindist = 25;

# Search phrase. The line terminator is stripped and a missing line becomes
# the empty string, so the value can be bound as a plain SQL parameter.
$input1 = <>;
$input1 = "" unless defined $input1;
$input1 =~ s/\r?\n\z//;

#### database connection
$dbh = DBI->connect($dns, $username, $password)
    or die "Can not connect to database: $DBI::errstr\n";
$dbh->do("SET NAMES sjis");

# The phrase is passed through a placeholder (?) instead of being pasted
# into the SQL text, so quotes in the input can neither break nor alter
# the statement.
$SQL_QUERY = <<'__CURSOR_1__';
select url.url, descriptions, url.rank from keywords inner join url on keywords.id = url.id inner join descriptions on keywords.id = descriptions.id where match (keywords.keywords) against (? IN BOOLEAN MODE) order by RAND() limit 1
__CURSOR_1__

$SQL_QUERY2 = <<'__CURSOR_2__';
select url, descriptions, url.rank from keywords inner join url on keywords.id = url.id inner join descriptions on keywords.id = descriptions.id where match (keywords.keywords) against (? IN BOOLEAN MODE) order by url.rank
__CURSOR_2__

# Both statements are prepared once and re-executed on every pass.
my $pick_sth = $dbh->prepare($SQL_QUERY)
    or die "Can not prepare random-row query: " . $dbh->errstr . "\n";
$cursor = $dbh->prepare($SQL_QUERY2)
    or die "Can not prepare ranked-rows query: " . $dbh->errstr . "\n";

# Append-mode output file, opened once for the whole run.
open(my $out, '>>', 'rank-beta-output.txt')
    or die "Can not open rank-beta-output.txt for appending: $!\n";

$rankplus = 0.1;

until ($count >= 10) {
    # Reference page: one random row matching the phrase.
    $pick_sth->execute($input1)
        or die "Random-row query failed: " . $pick_sth->errstr . "\n";
    while (my @row = $pick_sth->fetchrow_array) {
        $firsturl     = $row[0];
        $description1 = $row[1];
        $rank         = $row[2];
    }

    $similar = 0;

    # Every matching page, in ascending rank order.
    $cursor->execute($input1)
        or die "Ranked-rows query failed: " . $cursor->errstr . "\n";
    while (my @rec = $cursor->fetchrow_array) {
        $secondurl    = $rec[0];
        $description2 = $rec[1];
        $rank2        = $rec[2];

        # Decided afresh for every row. Previously an exact tie at 1.9 left
        # the previous row's value in place.
        # NOTE(review): ties are now counted as similar - confirm.
        $similar = ($rank2 - $rankplus <= 1.9) ? 1 : 0;

        # Levenshtein distance between $description1 and $description2.
        $distance = levenshtein($description1, $description2);

        # Likewise decided per row; a distance equal to the threshold used
        # to keep the previous row's label.
        # NOTE(review): a distance equal to the threshold is now "near" - confirm.
        $result = ($distance > $certaindist) ? "far" : "near";

        print "distance: $distance similar: $similar near or far: $result\n";
        print {$out} "\"$firsturl\",\"$secondurl\",\"$rank2\",\"$similar\",\"$distance\",\"$result\"\n";
    }
    #print "success\n";
    $count++;
}

close($out)
    or die "Can not close rank-beta-output.txt: $!\n";
$pick_sth->finish;
$cursor->finish;
$dbh->disconnect;
# Levenshtein (edit) distance between two strings.
#
# Arguments:
#   $s1, $s2 - the strings to compare; undef is treated as the empty string.
# Returns:
#   the minimum number of single-character insertions, deletions and
#   substitutions needed to turn $s1 into $s2 (always a defined number).
#
# "Character" means whatever split(//) yields, so on undecoded multi-byte
# text every byte is compared on its own.
#
# NOTE(review): an identical `levenshtein` is also defined earlier in this
# file; this later definition is the one Perl keeps.
sub levenshtein
{
    my ($s1, $s2) = @_;
    $s1 = '' unless defined $s1;
    $s2 = '' unless defined $s2;
    my ($len1, $len2) = (length $s1, length $s2);

    # If one of the strings is empty, the distance is the length
    # of the other string
    return $len2 if ($len1 == 0);
    return $len1 if ($len2 == 0);

    # Char-by-char processing is ahead, so prepare arrays of chars
    my @ar1 = split(//, $s1);
    my @ar2 = split(//, $s2);

    # Only the previous row of the distance matrix is needed to compute the
    # next one, so two rows are kept instead of the whole matrix.
    # Row 0 holds the distance from the empty string to each prefix of $s2.
    my @prev = (0 .. $len2);

    for (my $i = 1; $i <= $len1; ++$i)
    {
        # Column 0: distance from the first $i chars of $s1 to the empty string
        my @cur = ($i);

        for (my $j = 1; $j <= $len2; ++$j)
        {
            # Substitution cost: 0 when the ith char of $s1 equals the
            # jth char of $s2, otherwise 1
            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;

            # Minimum of:
            #  - the cell diagonally above-left plus the substitution cost
            #  - the cell immediately above plus 1
            #  - the cell immediately to the left plus 1
            my $best = $prev[$j-1] + $cost;
            $best = $prev[$j] + 1  if ($prev[$j] + 1 < $best);
            $best = $cur[$j-1] + 1 if ($cur[$j-1] + 1 < $best);

            $cur[$j] = $best;
        }
        @prev = @cur;
    }

    # The last cell of the last row is the distance between the full strings
    return $prev[$len2];
}
# Smallest number in a list.
#
# Takes a single array reference and returns the numerically smallest
# element it holds (undef when the array is empty).
#
# NOTE(review): an identical `min` is also defined earlier in this file;
# this later definition is the one Perl keeps.
sub min
{
    my ($values) = @_;

    # Start from the first element and compare the remainder against it.
    my ($lowest, @rest) = @{$values};
    for my $candidate (@rest) {
        $lowest = $candidate if ($candidate < $lowest);
    }

    return $lowest;
}
# Combine three numeric scores into one rank value.
#
# Positional arguments (labels taken from the original inline comments):
#   1. url
#   2. keywords      - weighted by 0.3
#   3. descriptions
# Returns: url + 0.3 * keywords + descriptions
#
# The arguments are held in lexicals, so calling this sub no longer
# overwrites the package globals $num, $num2 and $num3.
#
# NOTE(review): an identical `rankertext` is also defined earlier in this
# file; this later definition is the one Perl keeps.
sub rankertext{
    my ($url_score, $keyword_score, $description_score) = @_;
    return $url_score + (0.3 * $keyword_score) + $description_score;
}
# Count the characters in a byte string that mixes double-byte (full-width)
# and single-byte (half-width) characters.
#
# Argument: the text to measure (undef behaves like the empty string).
# Returns:  number of double-byte characters + length of what remains.
#
# All working values are lexicals: the sub no longer overwrites the caller's
# $_, and the half-width count no longer carries over from an earlier call
# when the remainder is empty.
#
# NOTE(review): a single byte outside [0-9A-Za-z_ ] that is not part of a
# pair stays encoded as "%XX" and therefore adds 3 to the total, and two
# such bytes in a row are counted as one double-byte character. That matches
# the original behaviour - confirm it is intended.
#
# NOTE(review): an identical `countcharacter` is also defined earlier in
# this file; this later definition is the one Perl keeps.
sub countcharacter {
    my ($message) = @_;
    $message = "" unless defined $message;

    # First hex-encode every byte outside [0-9A-Za-z_ ] as %XX.
    my $encoded = "$message";
    $encoded =~ s/([^0-9A-Za-z_ ])/'%'.unpack('H2',$1)/ge;

    # A full-width Japanese character now looks like "%82%a0" or "%8c%ea",
    # so every %XX%XX pair is removed and counted as one full-width
    # character. s///g returns an empty string when nothing matched.
    my $remainder  = $encoded;
    my $zenkaku_su = ($remainder =~ s/%\w\w%\w\w//g) || 0;

    # Whatever is left is counted as half-width text.
    my $hankaku_su = length $remainder;

    return $zenkaku_su + $hankaku_su;
}