# Mohawk Search ランカーのアルゴリズム公開。 - Perl使いの検索エンジン周辺技術に関するブログ。

use DBI;

{
    # Main script: reads one search expression from input, then for a fixed
    # number of rounds picks one random matching row as the reference and
    # compares its description against every matching row (ordered by rank).
    # Each comparison is printed to STDOUT and appended as a CSV row to the
    # report file.
    #
    # Strictures are confined to this bare block so that enabling them here
    # does not change how code elsewhere in the file is compiled.
    use strict;
    use warnings;

    # --- configuration ---------------------------------------------------
    my $dsn         = "DBI:mysql:xxx:xxx";
    my $username    = "xxxx";
    my $password    = "xxx";
    my $report_path = "rank-beta-output.txt";
    my $certaindist = 25;     # distances below this are reported as "near"
    my $rankplus    = 0.1;    # subtracted from a row's rank before the cutoff test
    my $rank_cutoff = 1.9;    # (rank - $rankplus) below this marks the row "similar"
    my $rounds      = 10;

    # The search expression is the first input line.  The line terminator is
    # stripped, and end-of-input is treated as an empty expression.
    my $input1 = <>;
    $input1 = '' unless defined $input1;
    $input1 =~ s/\r?\n\z//;

    # Wraps a value in double quotes for the CSV report, doubling any quote
    # characters inside it so the row stays parseable.
    my $csv_field = sub {
        my ($value) = @_;
        $value = '' unless defined $value;
        $value =~ s/"/""/g;
        return qq{"$value"};
    };

    #### database connection
    my $dbh = DBI->connect($dsn, $username, $password)
        or die "Can not connect to database: "
             . (defined $DBI::errstr ? $DBI::errstr : 'unknown error') . "\n";
    $dbh->{RaiseError} = 1;    # make failed prepare/execute calls fatal
    $dbh->do("SET NAMES sjis");

    # Both statements share the same join and full-text condition.  The search
    # expression is bound through a placeholder instead of being pasted into
    # the SQL text, so quotes in the input cannot alter the query.
    my $match_clause = 'from keywords'
        . ' inner join url on keywords.id = url.id'
        . ' inner join descriptions on keywords.id = descriptions.id'
        . ' where match (keywords.keywords) against (? IN BOOLEAN MODE)';

    # Statements are prepared once and re-executed every round.
    my $pick_sth = $dbh->prepare(
        "select url.url, descriptions, url.rank $match_clause order by RAND() limit 1");
    my $scan_sth = $dbh->prepare(
        "select url, descriptions, url.rank $match_clause order by url.rank");

    for my $round (1 .. $rounds) {

        # Reference row for this round: one random match.
        $pick_sth->execute($input1);
        my @reference = $pick_sth->fetchrow_array;
        $pick_sth->finish;
        next unless @reference;    # nothing matched, so nothing to compare

        # NOTE(review): the reference row's rank (third column) is selected
        # but never used; "similar" depends only on the compared row's rank.
        my ($firsturl, $description1) = @reference;

        $scan_sth->execute($input1);
        while (my ($secondurl, $description2, $rank2) = $scan_sth->fetchrow_array) {

            # Both flags are computed fresh for every row, so a value that
            # sits exactly on a threshold no longer inherits the previous
            # row's answer.  Boundary values count as not similar / "far".
            my $rank_value = defined $rank2 ? $rank2 : 0;
            my $similar    = ($rank_value - $rankplus < $rank_cutoff) ? 1 : 0;

            # Levenshtein distance between the two descriptions; NULL
            # descriptions are compared as empty strings.
            my $distance = levenshtein(
                defined $description1 ? $description1 : '',
                defined $description2 ? $description2 : '',
            );
            my $result = ($distance < $certaindist) ? "near" : "far";

            print "distance: $distance similar: $similar near or far: $result\n";

            # The report is opened and closed per row, so each row is on disk
            # as soon as it is produced and no file is created without rows.
            open(my $out, '>>', $report_path)
                or die "Can not append to $report_path: $!\n";
            print {$out} join(',', map { $csv_field->($_) }
                              $firsturl, $secondurl, $rank2,
                              $similar, $distance, $result), "\n"
                or die "Can not write to $report_path: $!\n";
            close($out)
                or die "Can not close $report_path: $!\n";
        }
    }

    $dbh->disconnect;
}






sub levenshtein
{
    # Returns the Levenshtein (edit) distance between two strings: the
    # minimum number of single-character insertions, deletions and
    # substitutions needed to turn $s1 into $s2.
    #
    # Parameters: $s1, $s2 - the strings to compare; an undefined value is
    #                        treated as the empty string.
    # Returns:    a non-negative integer.
    #
    my ($s1, $s2) = @_;
    $s1 = '' unless defined $s1;
    $s2 = '' unless defined $s2;
    my ($len1, $len2) = (length $s1, length $s2);

    # If one of the strings is empty, the distance is the length
    # of the other string
    #
    return $len2 if ($len1 == 0);
    return $len1 if ($len2 == 0);

    # Some char-by-char processing is ahead, so prepare
    # array of chars from the strings
    #
    my @ar1 = split(//, $s1);
    my @ar2 = split(//, $s2);

    # Only two rows of the distance matrix are alive at any time.
    # @prev starts as row 0: the distance from the empty prefix of $s1 to
    # each prefix of $s2, i.e. 0..$len2.
    #
    my @prev = (0 .. $len2);

    for my $i (1 .. $len1)
    {
        # Column 0 of row $i: distance from the first $i chars of $s1 to
        # the empty string.
        my @curr = ($i);

        for my $j (1 .. $len2)
        {
            # Substitution cost: 0 when the ith char of $s1 equals the jth
            # char of $s2 (nothing to substitute), 1 otherwise.
            #
            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;

            # The cell is the minimum of:
            #
            # - the cell immediately above plus 1
            # - the cell immediately to the left plus 1
            # - the cell diagonally above and to the left plus the cost
            #
            my $best = $prev[$j] + 1;

            my $from_left = $curr[$j-1] + 1;
            $best = $from_left if ($from_left < $best);

            my $from_diagonal = $prev[$j-1] + $cost;
            $best = $from_diagonal if ($from_diagonal < $best);

            $curr[$j] = $best;
        }

        @prev = @curr;
    }

    # The last cell of the last row is the distance between the full strings
    #
    return $prev[$len2];
}


# Smallest element (numeric comparison) of the list behind an array
# reference; undef when that list is empty.
#
sub min
{
    my ($values) = @_;

    # Seed with the first element, then let every later element challenge it.
    my ($smallest, @rest) = @{$values};

    for my $candidate (@rest)
    {
        if ($candidate < $smallest)
        {
            $smallest = $candidate;
        }
    }

    return $smallest;
}

sub rankertext{
    # Combines three numeric scores into one value:
    #     first + 0.3 * second + third
    #
    # Parameters (labels taken from the original comments):
    #   1. url
    #   2. keywords      (weighted by 0.3)
    #   3. descriptions
    # Missing or undefined arguments count as 0.
    #
    # The arguments are held in lexicals, so calling this sub no longer
    # overwrites package variables named $num, $num2 and $num3.
    my ($url_part, $keywords_part, $descriptions_part)
        = map { defined $_ ? $_ : 0 } @_[0 .. 2];

    return $url_part + (0.3 * $keywords_part) + $descriptions_part;
}

sub countcharacter {
    # Counts the characters in a Shift-JIS encoded byte string: every
    # double-byte (full-width) character counts as 1 and every single-byte
    # (half-width) character counts as 1.
    #
    # Parameter: the string to measure; undef is treated as empty.
    # Returns:   full-width count + half-width count.
    #
    # Assumes Shift-JIS input, matching the byte examples in the original
    # comment (a hiragana "a" is 82 A0, the kanji for "word" is 8C EA).
    #
    # All state is lexical: nothing carries over from an earlier call, and
    # neither $_ nor any package variable is modified.
    my ($message) = @_;
    return 0 unless defined $message && length $message;

    my $zenkaku_su = 0;    # double-byte characters seen
    my $hankaku_su = 0;    # single-byte characters seen

    # Walk the string from the left.  A Shift-JIS lead byte (81-9F, E0-FC)
    # followed by a valid trail byte (40-7E, 80-FC) is one full-width
    # character; any other byte stands alone.  Scanning in order matters
    # because trail bytes overlap the ASCII range.
    while ($message =~ /\G(?:([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])|.)/gs) {
        if (defined $1) {
            $zenkaku_su++;
        }
        else {
            $hankaku_su++;
        }
    }

    return $zenkaku_su + $hankaku_su;
}

use DBI;

{
    # Main script: reads one search expression from input, then for a fixed
    # number of rounds picks one random matching row as the reference and
    # compares its description against every matching row (ordered by rank).
    # Each comparison is printed to STDOUT and appended as a CSV row to the
    # report file.
    #
    # Strictures are confined to this bare block so that enabling them here
    # does not change how code elsewhere in the file is compiled.
    use strict;
    use warnings;

    # --- configuration ---------------------------------------------------
    my $dsn         = "DBI:mysql:xxx:xxx";
    my $username    = "xxxx";
    my $password    = "xxx";
    my $report_path = "rank-beta-output.txt";
    my $certaindist = 25;     # distances below this are reported as "near"
    my $rankplus    = 0.1;    # subtracted from a row's rank before the cutoff test
    my $rank_cutoff = 1.9;    # (rank - $rankplus) below this marks the row "similar"
    my $rounds      = 10;

    # The search expression is the first input line.  The line terminator is
    # stripped, and end-of-input is treated as an empty expression.
    my $input1 = <>;
    $input1 = '' unless defined $input1;
    $input1 =~ s/\r?\n\z//;

    # Wraps a value in double quotes for the CSV report, doubling any quote
    # characters inside it so the row stays parseable.
    my $csv_field = sub {
        my ($value) = @_;
        $value = '' unless defined $value;
        $value =~ s/"/""/g;
        return qq{"$value"};
    };

    #### database connection
    my $dbh = DBI->connect($dsn, $username, $password)
        or die "Can not connect to database: "
             . (defined $DBI::errstr ? $DBI::errstr : 'unknown error') . "\n";
    $dbh->{RaiseError} = 1;    # make failed prepare/execute calls fatal
    $dbh->do("SET NAMES sjis");

    # Both statements share the same join and full-text condition.  The search
    # expression is bound through a placeholder instead of being pasted into
    # the SQL text, so quotes in the input cannot alter the query.
    my $match_clause = 'from keywords'
        . ' inner join url on keywords.id = url.id'
        . ' inner join descriptions on keywords.id = descriptions.id'
        . ' where match (keywords.keywords) against (? IN BOOLEAN MODE)';

    # Statements are prepared once and re-executed every round.
    my $pick_sth = $dbh->prepare(
        "select url.url, descriptions, url.rank $match_clause order by RAND() limit 1");
    my $scan_sth = $dbh->prepare(
        "select url, descriptions, url.rank $match_clause order by url.rank");

    for my $round (1 .. $rounds) {

        # Reference row for this round: one random match.
        $pick_sth->execute($input1);
        my @reference = $pick_sth->fetchrow_array;
        $pick_sth->finish;
        next unless @reference;    # nothing matched, so nothing to compare

        # NOTE(review): the reference row's rank (third column) is selected
        # but never used; "similar" depends only on the compared row's rank.
        my ($firsturl, $description1) = @reference;

        $scan_sth->execute($input1);
        while (my ($secondurl, $description2, $rank2) = $scan_sth->fetchrow_array) {

            # Both flags are computed fresh for every row, so a value that
            # sits exactly on a threshold no longer inherits the previous
            # row's answer.  Boundary values count as not similar / "far".
            my $rank_value = defined $rank2 ? $rank2 : 0;
            my $similar    = ($rank_value - $rankplus < $rank_cutoff) ? 1 : 0;

            # Levenshtein distance between the two descriptions; NULL
            # descriptions are compared as empty strings.
            my $distance = levenshtein(
                defined $description1 ? $description1 : '',
                defined $description2 ? $description2 : '',
            );
            my $result = ($distance < $certaindist) ? "near" : "far";

            print "distance: $distance similar: $similar near or far: $result\n";

            # The report is opened and closed per row, so each row is on disk
            # as soon as it is produced and no file is created without rows.
            open(my $out, '>>', $report_path)
                or die "Can not append to $report_path: $!\n";
            print {$out} join(',', map { $csv_field->($_) }
                              $firsturl, $secondurl, $rank2,
                              $similar, $distance, $result), "\n"
                or die "Can not write to $report_path: $!\n";
            close($out)
                or die "Can not close $report_path: $!\n";
        }
    }

    $dbh->disconnect;
}






sub levenshtein
{
    # Returns the Levenshtein (edit) distance between two strings: the
    # minimum number of single-character insertions, deletions and
    # substitutions needed to turn $s1 into $s2.
    #
    # Parameters: $s1, $s2 - the strings to compare; an undefined value is
    #                        treated as the empty string.
    # Returns:    a non-negative integer.
    #
    my ($s1, $s2) = @_;
    $s1 = '' unless defined $s1;
    $s2 = '' unless defined $s2;
    my ($len1, $len2) = (length $s1, length $s2);

    # If one of the strings is empty, the distance is the length
    # of the other string
    #
    return $len2 if ($len1 == 0);
    return $len1 if ($len2 == 0);

    # Some char-by-char processing is ahead, so prepare
    # array of chars from the strings
    #
    my @ar1 = split(//, $s1);
    my @ar2 = split(//, $s2);

    # Only two rows of the distance matrix are alive at any time.
    # @prev starts as row 0: the distance from the empty prefix of $s1 to
    # each prefix of $s2, i.e. 0..$len2.
    #
    my @prev = (0 .. $len2);

    for my $i (1 .. $len1)
    {
        # Column 0 of row $i: distance from the first $i chars of $s1 to
        # the empty string.
        my @curr = ($i);

        for my $j (1 .. $len2)
        {
            # Substitution cost: 0 when the ith char of $s1 equals the jth
            # char of $s2 (nothing to substitute), 1 otherwise.
            #
            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;

            # The cell is the minimum of:
            #
            # - the cell immediately above plus 1
            # - the cell immediately to the left plus 1
            # - the cell diagonally above and to the left plus the cost
            #
            my $best = $prev[$j] + 1;

            my $from_left = $curr[$j-1] + 1;
            $best = $from_left if ($from_left < $best);

            my $from_diagonal = $prev[$j-1] + $cost;
            $best = $from_diagonal if ($from_diagonal < $best);

            $curr[$j] = $best;
        }

        @prev = @curr;
    }

    # The last cell of the last row is the distance between the full strings
    #
    return $prev[$len2];
}


# Smallest element (numeric comparison) of the list behind an array
# reference; undef when that list is empty.
#
sub min
{
    my ($values) = @_;

    # Seed with the first element, then let every later element challenge it.
    my ($smallest, @rest) = @{$values};

    for my $candidate (@rest)
    {
        if ($candidate < $smallest)
        {
            $smallest = $candidate;
        }
    }

    return $smallest;
}

sub rankertext{
    # Combines three numeric scores into one value:
    #     first + 0.3 * second + third
    #
    # Parameters (labels taken from the original comments):
    #   1. url
    #   2. keywords      (weighted by 0.3)
    #   3. descriptions
    # Missing or undefined arguments count as 0.
    #
    # The arguments are held in lexicals, so calling this sub no longer
    # overwrites package variables named $num, $num2 and $num3.
    my ($url_part, $keywords_part, $descriptions_part)
        = map { defined $_ ? $_ : 0 } @_[0 .. 2];

    return $url_part + (0.3 * $keywords_part) + $descriptions_part;
}

sub countcharacter {
    # Counts the characters in a Shift-JIS encoded byte string: every
    # double-byte (full-width) character counts as 1 and every single-byte
    # (half-width) character counts as 1.
    #
    # Parameter: the string to measure; undef is treated as empty.
    # Returns:   full-width count + half-width count.
    #
    # Assumes Shift-JIS input, matching the byte examples in the original
    # comment (a hiragana "a" is 82 A0, the kanji for "word" is 8C EA).
    #
    # All state is lexical: nothing carries over from an earlier call, and
    # neither $_ nor any package variable is modified.
    my ($message) = @_;
    return 0 unless defined $message && length $message;

    my $zenkaku_su = 0;    # double-byte characters seen
    my $hankaku_su = 0;    # single-byte characters seen

    # Walk the string from the left.  A Shift-JIS lead byte (81-9F, E0-FC)
    # followed by a valid trail byte (40-7E, 80-FC) is one full-width
    # character; any other byte stands alone.  Scanning in order matters
    # because trail bytes overlap the ASCII range.
    while ($message =~ /\G(?:([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])|.)/gs) {
        if (defined $1) {
            $zenkaku_su++;
        }
        else {
            $hankaku_su++;
        }
    }

    return $zenkaku_su + $hankaku_su;
}