用PHP和xapian构建全文检索[转]

大约从07年起,本博客就不转载了;
这篇算是以译文发的,原文在:http://www.contentwithstyle.co.uk/content/searching-with-xapian-and-php
========邪恶的分割线============
有的时候呢,嗯 ,mysql 就是不够快;尤其是在做全文检索的时候.各个字段都得正确地检索才行,而当我们的各个字段带有不同的权重时,事情就马上变得特别复杂了,这时你就需要xapian来救急了.
Xapian是什么东东
xapian是一个全文检索库,就和lucene和sphinx一样;它需要从c++代码编译,比较底层;现在已经有直接可用的php,perl,python绑定可以用了.目前提供了redhat和ubuntu的包;你可以在Mac os上编译,还可以通过cygwin来在windows下运行.
示例脚本
我不想去解释why和how,我只想展示一个简单的脚本;我封装的php文件有点大,读者可以从原文页面下载;
db.sql

-- Demo schema for the Xapian indexing example.
CREATE DATABASE `demo`;

-- One row per indexable item; `unique_key` is the identifier the indexer
-- uses to build the Xapian document's unique term.
CREATE TABLE `demo`.`demo` (
`id` INT( 10 ) UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY ,
`unique_key` VARCHAR( 255 ) NOT NULL ,
`name` VARCHAR( 255 ) NULL DEFAULT NULL ,
`summary` TEXT NULL DEFAULT NULL ,
`date` DATETIME NULL DEFAULT NULL ,
UNIQUE (`unique_key`));

-- Two sample rows; string literals must use plain ASCII single quotes.
INSERT INTO `demo`.`demo`
(`id`, `unique_key`, `name`, `summary`, `date`)
VALUES (NULL, 'foo', 'foo', 'foo bar test', '2008-11-05 00:00:00'),
(NULL , 'bar', 'bar', 'test foo bar', '2009-11-05 00:00:00');

XapianWrapper.php

xapian_read_db = new XapianDatabase(self::SETTINGS_XAPIAN_DB);
$this->xapian_stemmer = new XapianStem(“english”);
$this->xapian_enquire = new XapianEnquire($this->xapian_read_db);
} catch(Exception $e) {
throw new Exception(‘Could initialize Xapian: ‘ . $e->getMessage());
}
}

/**
 * Open the writable Xapian database (creating it if needed) and prepare
 * the term generator used for indexing.
 *
 * Populates $this->xapian_write_db, $this->xapian_indexer and
 * $this->xapian_stemmer.
 *
 * @throws Exception if any Xapian object cannot be created; the original
 *                   exception is attached as the previous exception.
 */
private function xapian_init_writable() {
    try {
        $this->xapian_write_db = new XapianWritableDatabase(self::SETTINGS_XAPIAN_DB, Xapian::DB_CREATE_OR_OPEN);
        $this->xapian_indexer = new XapianTermGenerator();
        $this->xapian_stemmer = new XapianStem("english");
        $this->xapian_indexer->set_stemmer($this->xapian_stemmer);
    } catch (Exception $e) {
        // Message previously read "Could initialize", which said the opposite
        // of what happened.
        throw new Exception('Could not initialize Xapian: ' . $e->getMessage(), 0, $e);
    }
}

/**
 * Connect to MySQL and select the configured database.
 *
 * Stores the connection in $this->mysql_link.
 *
 * @throws Exception if the connection or the database selection fails.
 */
private function mysql_init() {
    $this->mysql_link = mysql_connect(self::SETTINGS_MYSQL_HOST, self::SETTINGS_MYSQL_USER, self::SETTINGS_MYSQL_PASS);
    if (!$this->mysql_link) {
        // No link exists yet, so mysql_error() must be called without one.
        throw new Exception('Could not connect: ' . mysql_error());
    }

    $db_selected = mysql_select_db(self::SETTINGS_MYSQL_DB, $this->mysql_link);
    if (!$db_selected) {
        // Double-quoted so the apostrophe does not terminate the literal.
        throw new Exception("Can't use db : " . mysql_error($this->mysql_link));
    }
}

/**
 * Index method
 *
 * Reads a page of rows from the configured MySQL table and writes each one
 * into the Xapian database inside a single transaction.
 *
 * @param array $params Optional keys: 'offset' (default 0) and 'count'
 *                      (default self::DEFAULT_COUNT); both are cast to int
 *                      before being placed in the LIMIT clause.
 * @return stdClass Object whose ->indexed array holds one entry per row,
 *                  as returned by index_row().
 * @throws Exception if initialisation or the query fails, or if indexing a
 *                   row throws (the transaction is cancelled first).
 */
public function index($params) {
    $this->xapian_init_writable();
    $this->mysql_init();

    $response = new stdClass();
    $response->indexed = array();

    $offset = (isset($params['offset'])) ? intval($params['offset']) : 0;
    $count = (isset($params['count'])) ? intval($params['count']) : self::DEFAULT_COUNT;
    $sql = 'SELECT * FROM ' . self::SETTINGS_MYSQL_TABLE . ' LIMIT ' . $offset . ', ' . $count . ';';

    // Pass the link explicitly instead of relying on "last opened connection".
    $result = mysql_query($sql, $this->mysql_link);

    if (!$result) {
        $error = mysql_error($this->mysql_link);
        mysql_close($this->mysql_link);
        throw new Exception('Invalid query: ' . $error);
    }

    $this->xapian_write_db->begin_transaction();

    try {
        while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) {
            $response->indexed[] = $this->index_row($row);
        }
    } catch (Exception $e) {
        // Do not leave a half-written transaction or open MySQL resources behind.
        $this->xapian_write_db->cancel_transaction();
        mysql_free_result($result);
        mysql_close($this->mysql_link);
        throw $e;
    }

    $this->xapian_write_db->commit_transaction();
    mysql_free_result($result);
    mysql_close($this->mysql_link);

    return $response;
}

/**
 * Build a Xapian document from one database row and store it, replacing
 * any existing document that carries the same unique term.
 *
 * The name is indexed with weight 50 and the summary with weight 1; the
 * url, date (formatted as Ymd), unique key, name and summary are also
 * stored as document values so search() can return them.
 *
 * @param array $row Associative row with 'name', 'summary', 'unique_key',
 *                   'date' and optionally 'url'.
 * @return array Keys 'name', 'guid' and 'url' describing the indexed row.
 */
private function index_row($row) {
    // 'url' is optional: the demo table has no such column, so fall back to
    // an empty string instead of reading an undefined index.
    $url = isset($row['url']) ? $row['url'] : '';

    $doc = new XapianDocument();

    $this->xapian_indexer->set_document($doc);
    $this->xapian_indexer->index_text($row['name'], 50);
    $this->xapian_indexer->index_text($row['summary'], 1);

    // Unique term used both to replace and to delete the document later.
    $GUID = self::XAPIAN_PREFIX_UID . $row['unique_key'];
    $doc->add_term($GUID);

    $doc->add_value(self::XAPIAN_FIELD_URL, $url);
    $doc->add_value(self::XAPIAN_FIELD_DATE, date('Ymd', strtotime($row['date'])));
    $doc->add_value(self::XAPIAN_FIELD_UID, $row['unique_key']);
    $doc->add_value(self::XAPIAN_FIELD_NAME, $row['name']);
    $doc->add_value(self::XAPIAN_FIELD_SUMMARY, $row['summary']);

    $this->xapian_write_db->replace_document(strval($GUID), $doc);

    $row_response = array();
    $row_response['name'] = $row['name'];
    $row_response['guid'] = $row['unique_key'];
    $row_response['url'] = $url;
    return $row_response;
}

/**
 * Delete method
 *
 * Removes the documents identified by the given unique keys from the
 * Xapian database, inside a single transaction.
 *
 * @param array $params Expects 'items': a list of unique keys. A missing or
 *                      non-iterable 'items' is treated as an empty list.
 * @return array The unique keys that were submitted for deletion.
 * @throws Exception if initialisation fails or a deletion throws (the
 *                   transaction is cancelled first).
 */
public function delete($params) {
    $this->xapian_init_writable();

    $items = array();
    if (isset($params['items']) && (is_array($params['items']) || $params['items'] instanceof Traversable)) {
        $items = $params['items'];
    }

    $response = array();

    $this->xapian_write_db->begin_transaction();

    try {
        foreach ($items as $param_guid) {
            // Same prefixed term that index_row() attaches to each document.
            $GUID = self::XAPIAN_PREFIX_UID . $param_guid;
            $this->xapian_write_db->delete_document(strval($GUID));
            $response[] = $param_guid;
        }
    } catch (Exception $e) {
        $this->xapian_write_db->cancel_transaction();
        throw $e;
    }

    $this->xapian_write_db->commit_transaction();
    return $response;
}

/**
 * Search method
 *
 * Builds a Xapian query from the supplied criteria (all criteria are
 * AND-ed together), runs it and returns one page of matches.
 *
 * @param array $params Optional keys:
 *                      - 'date_from' / 'date_to': anything strtotime() parses;
 *                        compared against the stored Ymd date value.
 *                      - 'unique_key': exact match on the document's unique term.
 *                      - 'search': free-text query, parsed with stemming.
 *                      - 'offset' (default 0), 'count' (default self::DEFAULT_COUNT).
 *                      When no criterion is given, all documents match.
 * @return stdClass ->result_count (estimated total), ->results (list of
 *                  arrays with position, url, name, summary, date,
 *                  unique_key, percent) and ->execute (timing/debug info).
 */
public function search($params) {
    $this->xapian_init_readonly();

    $start = microtime(true);

    // queries array to later construct full query
    $arr_queries = array();

    // from date -- use the same value slot index_row() writes the date to
    if (!empty($params['date_from'])) {
        $arr_queries[] = new XapianQuery(XapianQuery::OP_VALUE_GE, self::XAPIAN_FIELD_DATE, date('Ymd', strtotime($params['date_from'])));
    }

    // to date
    if (!empty($params['date_to'])) {
        $arr_queries[] = new XapianQuery(XapianQuery::OP_VALUE_LE, self::XAPIAN_FIELD_DATE, date('Ymd', strtotime($params['date_to'])));
    }

    // unique key
    if (!empty($params['unique_key'])) {
        $arr_queries[] = new XapianQuery(self::XAPIAN_PREFIX_UID . $params['unique_key']);
    }

    // normal search query parsed
    if (!empty($params['search'])) {
        $qp = new XapianQueryParser();
        $qp->set_stemmer($this->xapian_stemmer);
        $qp->set_database($this->xapian_read_db);
        $qp->set_stemming_strategy(XapianQueryParser::STEM_SOME);
        $arr_queries[] = $qp->parse_query($params['search']);
    }

    // construct final query
    if (count($arr_queries) === 0) {
        // No criteria: an empty-term query is Xapian's "match all documents",
        // which avoids handing null to set_query().
        $query = new XapianQuery('');
    } else {
        $query = array_pop($arr_queries);
        foreach ($arr_queries as $sq) {
            $query = new XapianQuery(XapianQuery::OP_AND, $query, $sq);
        }
    }
    $this->xapian_enquire->set_query($query);

    // set the count to the specified params
    $offset = (isset($params['offset'])) ? intval($params['offset']) : 0;
    $count = (isset($params['count'])) ? intval($params['count']) : self::DEFAULT_COUNT;
    $matches = $this->xapian_enquire->get_mset($offset, $count);

    $response = new stdClass();
    $response->result_count = $matches->get_matches_estimated();
    $results = array();

    $i = $matches->begin();
    while (!$i->equals($matches->end())) {
        $doc = $i->get_document();

        $m = array();
        // get_rank() is zero-based; expose a one-based position.
        $m['position'] = $i->get_rank() + 1;
        $m['url'] = $doc->get_value(self::XAPIAN_FIELD_URL);
        $m['name'] = $doc->get_value(self::XAPIAN_FIELD_NAME);
        $m['summary'] = $doc->get_value(self::XAPIAN_FIELD_SUMMARY);
        $m['date'] = $doc->get_value(self::XAPIAN_FIELD_DATE);
        $m['unique_key'] = $doc->get_value(self::XAPIAN_FIELD_UID);
        $m['percent'] = $i->get_percent();

        $results[] = $m;
        $i->next();
    }

    $response->results = $results;
    $end = microtime(true);

    // runtime info
    $response->execute = new stdClass();
    $response->execute->call = 'search';
    $response->execute->offset = $offset;
    $response->execute->count = $count;
    $response->execute->start = $start;
    $response->execute->end = $end;
    $response->execute->time = $end - $start;

    // debug stuff
    $response->execute->debug = $query->get_description();

    return $response;
}
}

index.php

index(array());
print_r($res);

Search.php

‘foo’);
$res = $x->search($params);
print_r($res);

delete.php

array(‘foo’),
);
$res = $x->delete($params);
print_r($res);


使用示例:
您下载刚才的源码包后,就可以导入db.sql,并在命令行里运行程序;

bash$ php index.php
stdClass Object
(
[indexed] => Array
(
[0] => Array
(
[name] => foo
[guid] => foo
[url] =>
)

[1] => Array
(
[name] => bar
[guid] => bar
[url] =>
)

)

)
bash$ php search.php
stdClass Object
(
[result_count] => 2
[results] => Array
(
[0] => Array
(
[position] => 1
[url] =>
[name] => foo
[summary] => foo bar test
[date] => 20081105
[unique_key] => foo
[percent] => 100
)

[1] => Array
(
[position] => 2
[url] =>
[name] => bar
[summary] => test foo bar
[date] => 20091105
[unique_key] => bar
[percent] => 50
)

)

[execute] => stdClass Object
(
[call] => search
[offset] => 0
[count] => 10
[start] => 1256674866.79
[end] => 1256674866.79
[time] => 0.000944852828979
[debug] => Xapian::Query(Zfoo:(pos=1))
)

)
bash$ php delete.php
Array
(
[0] => foo
)
bash$ php search.php
stdClass Object
(
[result_count] => 1
[results] => Array
(
[0] => Array
(
[position] => 1
[url] =>
[name] => bar
[summary] => test foo bar
[date] => 20091105
[unique_key] => bar
[percent] => 100
)

)

[execute] => stdClass Object
(
[call] => search
[offset] => 0
[count] => 10
[start] => 1256674876.02
[end] => 1256674876.02
[time] => 0.000872850418091
[debug] => Xapian::Query(Zfoo:(pos=1))
)

)

接下来,扩展您自己的程序来满足您的各种需求吧,欢迎反馈.好好地享受检索的乐趣吧.

tokyo cabinet的替代产品:kyoto cabinet

说起tokyocabinet,大家应该都知道,知名的key-value对数据库,memcache的替代产品,tokyotyrant和flare都是用的tokyocabinet来做的底层存储,嗯;tokyocabinet是作者对他之前的qdbm的一个升华.

作者果然”很猛很持久”,最近新推出了kyoto cabinet,是用C++写的,嗯,作者终于学会C++了(嗯,我是调侃一下,作者之前的所有作品都是用纯C写的,我很欣赏;现在用C++,可不是个好兆头!);

之前的tokyo cabinet意思好像是”东京柜子”,不知道为什么作者这么怀念东京的小柜子呢…刚出炉的这个kyoto cabinet,呃,名字似乎就是京都小柜子….

来自评论采集机器人:tokyo cabinet的意思是,东京内阁,kyoto cabinet 的意思是,京都内阁, 而tokyo tyrant的意思是…..东京暴君;

长话短说,拿kyoto cabinet和tokyo cabinet比较一下:(来源:作者pdf说明文档)

空间效率更高(数据库尺寸更小)

并发性能:在 多线程情况下性能更好(使用了CAS等原子操作)

可移植性:不再需要posix的依赖(也就是说,windows上应该也能跑了)

可用性:面向对象的编程(当然,用上C++了嘛)

鲁棒性(通俗一点:健壮性):自动的事务和回滚

另外:

kyoto cabinet 依赖于现代的C++实现,并且在多线程的实现上多了锁的开销.

项目地址:http://www.1978th.net/kyotocabinet/

cloudapi.info 预谋提供的下一个云接口:垃圾评论判别

http://cloudapi.info/ 本来是个很小很小的应用,是我给小宝宝的礼物,web站点我也就随手用rails搭起来,放上去才几天,没想到马上就被垃圾信息注册机器人给盯上了.于是我决定,给cloudapi.info 提供的下一个接口即是垃圾信息判别.

一些想法:

cloudapi 提供垃圾评论判别接口的想法
1.首先得建IP黑名单库,邮箱黑名单库,网址黑名单库,User-agent黑名单库,只要在黑名单库里,立刻杀无赦;
2.将 评论包含的网址,评论作者,评论内容等放入bayes中学习;
3:提供如下API:
其中$comment变量均是Hash(PHP中的数组也是hash),比如:
// Example of the $comment hash passed to the learn/unlearn/classify calls.
$comment = array(
    "word" => "…..我们都是好同志",
    "ip" => "127.0.0.1",
    "email" => "aaa@example.com",
    "author" => "一米六二",
    "url" => "http://www.162cm.com/",
    "ua" => "Mozilla/Firefox(3.6)"
);
// Record the current comment as spam so the system can learn from it.
api_learn_spam($database, $comment);
// Record the current comment as non-spam (ham) so the system can learn from it.
api_learn_ham($database, $comment);
// The reverse of the two operations above.
api_unlearn_spam($database, $comment);
api_unlearn_ham($database, $comment);
// Add an IP, a URL, a word or a specific User-Agent to the blacklist.
api_push_ip_blacklist($database, $ip);
api_push_url_blacklist($database, $url);
api_push_word_blacklist($database, $word);
api_push_ua_blacklist($database, $ua);
// The reverse of the four operations above.
api_unpush_ip_blacklist($database, $ip);
api_unpush_url_blacklist($database, $url);
api_unpush_word_blacklist($database, $word);
api_unpush_ua_blacklist($database, $ua);
// Ask the system to decide whether the current comment is spam.
api_classify($database, $comment);

cloudapi 提供垃圾评论判别接口的想法

1.首先得建IP黑名单库,邮箱黑名单库,网址黑名单库,User-agent黑名单库,只要在黑名单库里,立刻杀无赦;

2.将 评论包含的网址,评论作者,评论内容等放入bayes中学习;

3:提供如下API:

其中$comment变量均是Hash(PHP中的数组也是hash),比如:

// Example of the $comment hash passed to the learn/unlearn/classify calls.
$comment = array(
    "word" => "…..我们都是好同志",
    "ip" => "127.0.0.1",
    "email" => "aaa@example.com",
    "author" => "一米六二",
    "url" => "http://www.162cm.com/",
    "ua" => "Mozilla/Firefox(3.6)"
);

// Record the current comment as spam so the system can learn from it.
api_learn_spam($database, $comment);

// Record the current comment as non-spam (ham) so the system can learn from it.
api_learn_ham($database, $comment);

// The reverse of the two operations above.
api_unlearn_spam($database, $comment);
api_unlearn_ham($database, $comment);

// Add an IP, a URL, a word or a specific User-Agent to the blacklist.
api_push_ip_blacklist($database, $ip);
api_push_url_blacklist($database, $url);
api_push_word_blacklist($database, $word);
api_push_ua_blacklist($database, $ua);

// The reverse of the four operations above.
api_unpush_ip_blacklist($database, $ip);
api_unpush_url_blacklist($database, $url);
api_unpush_word_blacklist($database, $word);
api_unpush_ua_blacklist($database, $ua);

// Ask the system to decide whether the current comment is spam.
api_classify($database, $comment);

基于cloudapi.info的wordpress 相关文章插件

为了向大家展示cloudapi.info是怎样提供服务的,写了一个wordpress相关文章插件;在部分文章上相关文章找得还是很准确的,大家不妨试一试.
项目地址在:
http://github.com/xurenlu/wpra2/
源代码下载:
http://github.com/xurenlu/wpra2/archives/master
使用方法:
解压后传到wordpress安装目录的wp-content/plugins/目录下,在wordpress后台选项中插件菜单下启用即可;启用后在第一个主菜单下会新增一个相关文章的选项;如果您懒,不进去填也可以,那样的话您使用的是默认的搜索资料库,您的博文后面的相关文章可能会显示其他博主的文章;当然您的博文也可能显示在别的博客上;我们建议您自行申请一个cloudapi.info的搜索资料库,自己玩一下!

提示:

  1. 这个插件是在文章展示时计算相关文章,并缓存在数据库中;
  2. 插件新建了一个表,叫{prefix}ra_table;
  3. 插件数据并不是一成不变的,缓存是7天过期;
  4. 缓存不是主动过期的,而是有更新时才会真正删除过期数据;