# Mohawk Search ランカーのアルゴリズム公開。 (Publication of the Mohawk Search ranker algorithm.)

use DBI;

# ---------------------------------------------------------------------------
# Ranker experiment driver.
#
# Reads one search phrase from <>, then ten times picks one random URL that
# matches the phrase and compares its description with the description of
# every URL matching the same phrase.  For each pair it prints the
# Levenshtein distance, a rank-based "similar" flag and a near/far verdict,
# and appends the same data as a CSV line to rank-beta-output.txt.
# ---------------------------------------------------------------------------
my ($dns) = "DBI:mysql:xxx:xxx";
my ($username) = "xxxx";
my ($password) = "xxx";
my ($dbh, $sth);
my (@ary);
my $dbfile   = "spider.dat";
my $outfile = "bunseki-output.csv";

# Distance threshold separating "near" from "far" descriptions.
# Kept as a package variable, as in the original script.
our $certaindist = 25;

# Search phrase; the trailing newline must not become part of the query.
our $input1 = <>;
$input1 = '' unless defined $input1;
chomp $input1;

#### database connection
$dbh = DBI->connect($dns, $username, $password)
    or die "Can not connect to database: $DBI::errstr\n";
$dbh->do("SET NAMES sjis");

# The search phrase is bound through a placeholder instead of being
# interpolated into the SQL text, so it cannot alter the statement.
# TODO(review): both statements have empty join conditions ("on =").  The
# join columns are not present in this file and must be filled in before
# the queries can run.
my $SQL_QUERY  = 'select url.url, descriptions, url.rank from keywords inner join url on = inner join descriptions on = where match (keywords.keywords) against (? IN BOOLEAN MODE) order by RAND() limit 1';
my $SQL_QUERY2 = 'select url, descriptions, url.rank from keywords inner join url on = inner join descriptions on = where match (keywords.keywords) against (? IN BOOLEAN MODE) order by url.rank';

# Separate handles: the inner query must not overwrite the outer cursor.
my $pick_sth = $dbh->prepare($SQL_QUERY)
    or die "Can not prepare query: " . $dbh->errstr . "\n";
my $list_sth = $dbh->prepare($SQL_QUERY2)
    or die "Can not prepare query: " . $dbh->errstr . "\n";

# Open the log once, in append mode, instead of re-opening it for every row.
open(my $out, '>>', 'rank-beta-output.txt')
    or die "Can not open rank-beta-output.txt: $!\n";

my $rankplus = 0.1;
my $count    = 0;

until ($count >= 10) {

    $pick_sth->execute($input1)
        or die "Query failed: " . $dbh->errstr . "\n";

    while (my @row = $pick_sth->fetchrow_array) {
        # NOTE(review): $row[2] (url.rank of the sampled URL) is selected
        # but never used; the "similar" test below looks only at the rank
        # of the second URL -- confirm that this is intended.
        my ($firsturl, $description1) = @row;

        $list_sth->execute($input1)
            or die "Query failed: " . $dbh->errstr . "\n";

        while (my @rec = $list_sth->fetchrow_array) {
            my ($secondurl, $description2, $rank2) = @rec;

            # Decide afresh for every row so that a difference of exactly
            # 1.9 cannot inherit the flag of the previous row.
            my $similar = ($rank2 - $rankplus < 1.9) ? 1 : 0;

            # Levenshtein distance between $description1 and $description2.
            my $distance = levenshtein($description1, $description2);

            # A distance exactly equal to the threshold counts as "near".
            my $result = ($distance > $certaindist) ? "far" : "near";

            print "distance: $distance similar: $similar near or far: $result\n";

            print {$out} "\"$firsturl\",\"$secondurl\",\"$rank2\",\"$similar\",\"$distance\",\"$result\"\n";
        }
    }

    # One sampling round finished; without this the loop never terminates.
    $count++;
}

close($out) or die "Can not close rank-beta-output.txt: $!\n";


#print "success\n";


# Compute the Levenshtein (edit) distance between two strings.
#
# Arguments: $s1, $s2 - the strings to compare; undef is treated as ''.
# Returns:   the minimum number of single-element insertions, deletions and
#            substitutions needed to turn $s1 into $s2.
#
# The comparison works on the elements produced by split(//), i.e. on
# characters for decoded strings and on bytes for undecoded byte strings.
sub levenshtein {
    my ($s1, $s2) = @_;
    $s1 = '' unless defined $s1;
    $s2 = '' unless defined $s2;

    my ($len1, $len2) = (length $s1, length $s2);

    # If one of the strings is empty, the distance is the length
    # of the other string.
    return $len2 if $len1 == 0;
    return $len1 if $len2 == 0;

    # Char-by-char processing is ahead, so prepare arrays of chars.
    my @ar1 = split(//, $s1);
    my @ar2 = split(//, $s2);

    # Only the previous row of the distance matrix is ever consulted, so two
    # rows are kept instead of the full matrix.  Row 0 holds the distances
    # from the empty string: 0 .. $len2.
    my @prev = (0 .. $len2);

    for my $i (1 .. $len1) {
        # Column 0 holds the distance to the empty string: $i deletions.
        my @curr = ($i);

        for my $j (1 .. $len2) {
            # Substitution cost: 0 when the ith char of $s1 equals the jth
            # char of $s2 (nothing to substitute), 1 otherwise.
            my $cost = ($ar1[$i - 1] eq $ar2[$j - 1]) ? 0 : 1;

            # The cell is the minimum of:
            # - the cell immediately above plus 1,
            # - the cell immediately to the left plus 1,
            # - the cell diagonally above-left plus the substitution cost.
            my $best  = $prev[$j] + 1;
            my $left  = $curr[$j - 1] + 1;
            my $subst = $prev[$j - 1] + $cost;
            $best = $left  if $left  < $best;
            $best = $subst if $subst < $best;

            $curr[$j] = $best;
        }

        @prev = @curr;
    }

    # The distance is the rightmost cell of the last row.
    return $prev[$len2];
}

# Return the numerically smallest element of a list.
#
# Arguments: $_[0] - reference to an array of numbers.
# Returns:   the smallest element, or undef when the array is empty or no
#            array reference was passed.
sub min {
    my ($values) = @_;
    my @list = @{ $values || [] };
    my $min  = $list[0];

    foreach my $i (@list) {
        $min = $i if $i < $min;
    }

    return $min;
}

# Combine three numeric scores into one value.
#
# Arguments (labelled "url", "keywords" and "descriptions" in the original
# code -- presumably the score of each field; verify against callers):
#   $_[0] - url value, weight 1
#   $_[1] - keywords value, weight 0.3
#   $_[2] - descriptions value, weight 1
# Returns: $url + 0.3 * $keywords + $descriptions
sub rankertext {
    # Lexicals, so the arguments no longer leak into package globals.
    my ($url, $keywords, $descriptions) = @_;

    return $url + (0.3 * $keywords) + $descriptions;
}

# Count the characters of a Shift_JIS encoded byte string.
#
# A full-width (two-byte) character such as "\x82\xa0" counts as one, and
# every other byte (ASCII, half-width katakana, stray bytes) counts as one.
#
# Arguments: $_[0] - the byte string to measure; undef counts as empty.
# Returns:   number of full-width characters plus number of single bytes.
#
# The original percent-encoded the string and removed every "%xx%xx" pair.
# That miscounted encoded ASCII punctuation ("!" became "%21" and counted as
# three, "!!" counted as one full-width character), reused a stale
# half-width count from the previous call, and overwrote the global $_.
# The bytes are now scanned directly.
sub countcharacter {
    my ($message) = @_;
    return 0 unless defined $message && length $message;

    my $zenkaku_su = 0;    # full-width (two-byte) characters
    my $hankaku_su = 0;    # half-width (single-byte) characters

    # Shift_JIS lead bytes are 0x81-0x9F and 0xE0-0xFC, trail bytes are
    # 0x40-0x7E and 0x80-0xFC.  \G keeps each match adjacent to the last
    # one, so a trail byte is never mistaken for the start of a character.
    while ($message =~ /\G(?:([\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc])|.)/gs) {
        if (defined $1) {
            $zenkaku_su++;
        }
        else {
            $hankaku_su++;
        }
    }

    return $zenkaku_su + $hankaku_su;
}

use DBI;

# ---------------------------------------------------------------------------
# Ranker experiment driver.
#
# NOTE(review): this driver also appears near the top of this file, so
# running the file executes it twice -- consider removing one copy.
#
# Reads one search phrase from <>, then ten times picks one random URL that
# matches the phrase and compares its description with the description of
# every URL matching the same phrase.  For each pair it prints the
# Levenshtein distance, a rank-based "similar" flag and a near/far verdict,
# and appends the same data as a CSV line to rank-beta-output.txt.
# ---------------------------------------------------------------------------
my ($dns) = "DBI:mysql:xxx:xxx";
my ($username) = "xxxx";
my ($password) = "xxx";
my ($dbh, $sth);
my (@ary);
my $dbfile   = "spider.dat";
my $outfile = "bunseki-output.csv";

# Distance threshold separating "near" from "far" descriptions.
# Kept as a package variable, as in the original script.
our $certaindist = 25;

# Search phrase; the trailing newline must not become part of the query.
our $input1 = <>;
$input1 = '' unless defined $input1;
chomp $input1;

#### database connection
$dbh = DBI->connect($dns, $username, $password)
    or die "Can not connect to database: $DBI::errstr\n";
$dbh->do("SET NAMES sjis");

# The search phrase is bound through a placeholder instead of being
# interpolated into the SQL text, so it cannot alter the statement.
# TODO(review): both statements have empty join conditions ("on =").  The
# join columns are not present in this file and must be filled in before
# the queries can run.
my $SQL_QUERY  = 'select url.url, descriptions, url.rank from keywords inner join url on = inner join descriptions on = where match (keywords.keywords) against (? IN BOOLEAN MODE) order by RAND() limit 1';
my $SQL_QUERY2 = 'select url, descriptions, url.rank from keywords inner join url on = inner join descriptions on = where match (keywords.keywords) against (? IN BOOLEAN MODE) order by url.rank';

# Separate handles: the inner query must not overwrite the outer cursor.
my $pick_sth = $dbh->prepare($SQL_QUERY)
    or die "Can not prepare query: " . $dbh->errstr . "\n";
my $list_sth = $dbh->prepare($SQL_QUERY2)
    or die "Can not prepare query: " . $dbh->errstr . "\n";

# Open the log once, in append mode, instead of re-opening it for every row.
open(my $out, '>>', 'rank-beta-output.txt')
    or die "Can not open rank-beta-output.txt: $!\n";

my $rankplus = 0.1;
my $count    = 0;

until ($count >= 10) {

    $pick_sth->execute($input1)
        or die "Query failed: " . $dbh->errstr . "\n";

    while (my @row = $pick_sth->fetchrow_array) {
        # NOTE(review): $row[2] (url.rank of the sampled URL) is selected
        # but never used; the "similar" test below looks only at the rank
        # of the second URL -- confirm that this is intended.
        my ($firsturl, $description1) = @row;

        $list_sth->execute($input1)
            or die "Query failed: " . $dbh->errstr . "\n";

        while (my @rec = $list_sth->fetchrow_array) {
            my ($secondurl, $description2, $rank2) = @rec;

            # Decide afresh for every row so that a difference of exactly
            # 1.9 cannot inherit the flag of the previous row.
            my $similar = ($rank2 - $rankplus < 1.9) ? 1 : 0;

            # Levenshtein distance between $description1 and $description2.
            my $distance = levenshtein($description1, $description2);

            # A distance exactly equal to the threshold counts as "near".
            my $result = ($distance > $certaindist) ? "far" : "near";

            print "distance: $distance similar: $similar near or far: $result\n";

            print {$out} "\"$firsturl\",\"$secondurl\",\"$rank2\",\"$similar\",\"$distance\",\"$result\"\n";
        }
    }

    # One sampling round finished; without this the loop never terminates.
    $count++;
}

close($out) or die "Can not close rank-beta-output.txt: $!\n";


#print "success\n";


# Compute the Levenshtein (edit) distance between two strings.
#
# NOTE(review): a sub named levenshtein is also defined earlier in this
# file; Perl keeps the last definition -- consider removing one copy.
#
# Arguments: $s1, $s2 - the strings to compare; undef is treated as ''.
# Returns:   the minimum number of single-element insertions, deletions and
#            substitutions needed to turn $s1 into $s2.
#
# The comparison works on the elements produced by split(//), i.e. on
# characters for decoded strings and on bytes for undecoded byte strings.
sub levenshtein {
    my ($s1, $s2) = @_;
    $s1 = '' unless defined $s1;
    $s2 = '' unless defined $s2;

    my ($len1, $len2) = (length $s1, length $s2);

    # If one of the strings is empty, the distance is the length
    # of the other string.
    return $len2 if $len1 == 0;
    return $len1 if $len2 == 0;

    # Char-by-char processing is ahead, so prepare arrays of chars.
    my @ar1 = split(//, $s1);
    my @ar2 = split(//, $s2);

    # Only the previous row of the distance matrix is ever consulted, so two
    # rows are kept instead of the full matrix.  Row 0 holds the distances
    # from the empty string: 0 .. $len2.
    my @prev = (0 .. $len2);

    for my $i (1 .. $len1) {
        # Column 0 holds the distance to the empty string: $i deletions.
        my @curr = ($i);

        for my $j (1 .. $len2) {
            # Substitution cost: 0 when the ith char of $s1 equals the jth
            # char of $s2 (nothing to substitute), 1 otherwise.
            my $cost = ($ar1[$i - 1] eq $ar2[$j - 1]) ? 0 : 1;

            # The cell is the minimum of:
            # - the cell immediately above plus 1,
            # - the cell immediately to the left plus 1,
            # - the cell diagonally above-left plus the substitution cost.
            my $best  = $prev[$j] + 1;
            my $left  = $curr[$j - 1] + 1;
            my $subst = $prev[$j - 1] + $cost;
            $best = $left  if $left  < $best;
            $best = $subst if $subst < $best;

            $curr[$j] = $best;
        }

        @prev = @curr;
    }

    # The distance is the rightmost cell of the last row.
    return $prev[$len2];
}

# Return the numerically smallest element of a list.
#
# NOTE(review): a sub named min is also defined earlier in this file; Perl
# keeps the last definition -- consider removing one copy.
#
# Arguments: $_[0] - reference to an array of numbers.
# Returns:   the smallest element, or undef when the array is empty or no
#            array reference was passed.
sub min {
    my ($values) = @_;
    my @list = @{ $values || [] };
    my $min  = $list[0];

    foreach my $i (@list) {
        $min = $i if $i < $min;
    }

    return $min;
}

# Combine three numeric scores into one value.
#
# NOTE(review): a sub named rankertext is also defined earlier in this
# file; Perl keeps the last definition -- consider removing one copy.
#
# Arguments (labelled "url", "keywords" and "descriptions" in the original
# code -- presumably the score of each field; verify against callers):
#   $_[0] - url value, weight 1
#   $_[1] - keywords value, weight 0.3
#   $_[2] - descriptions value, weight 1
# Returns: $url + 0.3 * $keywords + $descriptions
sub rankertext {
    # Lexicals, so the arguments no longer leak into package globals.
    my ($url, $keywords, $descriptions) = @_;

    return $url + (0.3 * $keywords) + $descriptions;
}

# Count the characters of a Shift_JIS encoded byte string.
#
# NOTE(review): a sub named countcharacter is also defined earlier in this
# file; Perl keeps the last definition -- consider removing one copy.
#
# A full-width (two-byte) character such as "\x82\xa0" counts as one, and
# every other byte (ASCII, half-width katakana, stray bytes) counts as one.
#
# Arguments: $_[0] - the byte string to measure; undef counts as empty.
# Returns:   number of full-width characters plus number of single bytes.
#
# The original percent-encoded the string and removed every "%xx%xx" pair.
# That miscounted encoded ASCII punctuation ("!" became "%21" and counted as
# three, "!!" counted as one full-width character), reused a stale
# half-width count from the previous call, and overwrote the global $_.
# The bytes are now scanned directly.
sub countcharacter {
    my ($message) = @_;
    return 0 unless defined $message && length $message;

    my $zenkaku_su = 0;    # full-width (two-byte) characters
    my $hankaku_su = 0;    # half-width (single-byte) characters

    # Shift_JIS lead bytes are 0x81-0x9F and 0xE0-0xFC, trail bytes are
    # 0x40-0x7E and 0x80-0xFC.  \G keeps each match adjacent to the last
    # one, so a trail byte is never mistaken for the start of a character.
    while ($message =~ /\G(?:([\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc])|.)/gs) {
        if (defined $1) {
            $zenkaku_su++;
        }
        else {
            $hankaku_su++;
        }
    }

    return $zenkaku_su + $hankaku_su;
}
