www.gusucode.com > Carbon Forum PHP轻论坛系统 v3.6.5源码程序 > Carbon-Forum-3.6.5/includes/PHPAnalysis.class.php
<?php /* * 居于Unicode编码词典的php分词器 * 1、只适用于php5,必要函数 iconv * 2、本程序是使用RMM逆向匹配算法进行分词的,词库需要特别编译,本类里提供了 MakeDict() 方法 * 3、简单操作流程: SetSource -> StartAnalysis -> Get***Result * 4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作 * * Copyright IT柏拉图 QQ: 2500875 Email: 2500875#qq.com * * @version 2.0 * */ //常量定义 define('_SP_', chr(0xFF) . chr(0xFE)); define('UCS2', 'ucs-2be'); class PhpAnalysis { //hash算法选项 public $mask_value = 0x000F; //Default:0xFFFF //输入和输出的字符编码(只允许 utf-8、gbk/gb2312/gb18030、big5 三种类型) public $sourceCharSet = 'utf-8'; public $targetCharSet = 'utf-8'; //生成的分词结果数据类型 1 为全部, 2为 词典词汇及单个中日韩简繁字符及英文, 3 为词典词汇及英文 public $resultType = 1; //句子长度小于这个数值时不拆分,notSplitLen = n(个汉字) * 2 + 1 public $notSplitLen = 5; //把英文单词全部转小写 public $toLower = false; //使用最大切分模式对二元词进行消岐 public $differMax = false; //尝试合并单字 public $unitWord = true; //初始化类时直接加载词典 public static $loadInit = false; //使用热门词优先模式进行消岐 public $differFreq = false; //被转换为unicode的源字符串 private $sourceString = ''; //附加词典 public $addonDic = array(); public $addonDicFile = 'dict/words_addons.dic'; //主词典 public $dicStr = ''; public $mainDic = array(); public $mainDicHand = false; public $mainDicInfos = array(); public $mainDicFile = 'dict/base_dic_full.dic'; //是否直接载入词典(选是载入速度较慢,但解析较快;选否载入较快,但解析较慢,需要时才会载入特定的词条) private $isLoadAll = false; //主词典词语最大长度 x / 2 private $dicWordMax = 14; //粗分后的数组(通常是截取句子等用途) private $simpleResult = array(); //最终结果(用空格分开的词汇列表) private $finallyResult = ''; //是否已经载入词典 public $isLoadDic = false; //系统识别或合并的新词 public $newWords = array(); public $foundWordStr = ''; //词库载入时间 public $loadTime = 0; /** * 构造函数 * @param $source_charset * @param $target_charset * @param $load_alldic * @param $source * * @return void */ public function __construct($source_charset = 'utf-8', $target_charset = 'utf-8', $load_all = true, $source = '') { $this->addonDicFile = __DIR__ . '/' . $this->addonDicFile; $this->mainDicFile = __DIR__ . '/' . $this->mainDicFile; $this->SetSource($source, $source_charset, $target_charset); $this->isLoadAll = $load_all; if (self::$loadInit) $this->LoadDict(); } /** * 析构函数 */ function __destruct() { if ($this->mainDicHand !== false) { @fclose($this->mainDicHand); } } /** * 根据字符串计算key索引 * @param $key * @return short int */ private function _get_index($key) { $l = strlen($key); $h = 0x238f13af; while ($l--) { $h += ($h << 5); $h ^= ord($key[$l]); $h &= 0x7fffffff; } return ($h % $this->mask_value); } /** * 从文件获得词 * @param $key * @param $type (类型 word 或 key_groups) * @return short int */ public function GetWordInfos($key, $type = 'word') { if (!$this->mainDicHand) { $this->mainDicHand = fopen($this->mainDicFile, 'r'); } $p = 0; $keynum = $this->_get_index($key); if (isset($this->mainDicInfos[$keynum])) { $data = $this->mainDicInfos[$keynum]; } else { //rewind( $this->mainDicHand ); $move_pos = $keynum * 8; fseek($this->mainDicHand, $move_pos, SEEK_SET); $dat = fread($this->mainDicHand, 8); $arr = unpack('I1s/n1l/n1c', $dat); if ($arr['l'] == 0) { return false; } fseek($this->mainDicHand, $arr['s'], SEEK_SET); $data = @unserialize(fread($this->mainDicHand, $arr['l'])); $this->mainDicInfos[$keynum] = $data; } if (!is_array($data) || !isset($data[$key])) { return false; } return ($type == 'word' ? $data[$key] : $data); } /** * 设置源字符串 * @param $source * @param $source_charset * @param $target_charset * * @return bool */ public function SetSource($source, $source_charset = 'utf-8', $target_charset = 'utf-8') { $this->sourceCharSet = strtolower($source_charset); $this->targetCharSet = strtolower($target_charset); $this->simpleResult = array(); $this->finallyResult = array(); $this->finallyIndex = array(); if ($source != '') { $rs = true; if (preg_match("/^utf/", $source_charset)) { $this->sourceString = iconv('utf-8', UCS2, $source); } else if (preg_match("/^gb/", $source_charset)) { $this->sourceString = iconv('utf-8', UCS2, iconv('gb18030', 'utf-8', $source)); } else if (preg_match("/^big/", $source_charset)) { $this->sourceString = iconv('utf-8', UCS2, iconv('big5', 'utf-8', $source)); } else { $rs = false; } } else { $rs = false; } return $rs; } /** * 设置结果类型(只在获取finallyResult才有效) * @param $rstype 1 为全部, 2去除特殊符号 * * @return void */ public function SetResultType($rstype) { $this->resultType = $rstype; } /** * 载入词典 * * @return void */ public function LoadDict($maindic = '') { $startt = microtime(true); //正常读取文件 $dicAddon = $this->addonDicFile; if ($maindic == '' || !file_exists($maindic)) { $dicWords = $this->mainDicFile; } else { $dicWords = $maindic; $this->mainDicFile = $maindic; } //加载主词典(只打开) $this->mainDicHand = fopen($dicWords, 'r'); //载入副词典 $hw = ''; $ds = file($dicAddon); foreach ($ds as $d) { $d = trim($d); if ($d == '') continue; $estr = substr($d, 1, 1); if ($estr == ':') { $hw = substr($d, 0, 1); } else { $spstr = _SP_; $spstr = iconv(UCS2, 'utf-8', $spstr); $ws = explode(',', $d); $wall = iconv('utf-8', UCS2, join($spstr, $ws)); $ws = explode(_SP_, $wall); foreach ($ws as $estr) { $this->addonDic[$hw][$estr] = strlen($estr); } } } $this->loadTime = microtime(true) - $startt; $this->isLoadDic = true; } /** * 检测某个词是否存在 */ public function IsWord($word) { $winfos = $this->GetWordInfos($word); return ($winfos !== false); } /** * 获得某个词的词性及词频信息 * @parem $word unicode编码的词 * @return void */ public function GetWordProperty($word) { if (strlen($word) < 4) { return '/s'; } $infos = $this->GetWordInfos($word); return isset($infos[1]) ? "/{$infos[1]}{$infos[0]}" : "/s"; } /** * 指定某词的词性信息(通常是新词) * @parem $word unicode编码的词 * @parem $infos array('c' => 词频, 'm' => 词性); * @return void; */ public function SetWordInfos($word, $infos) { if (strlen($word) < 4) { return; } if (isset($this->mainDicInfos[$word])) { $this->newWords[$word]++; $this->mainDicInfos[$word]['c']++; } else { $this->newWords[$word] = 1; $this->mainDicInfos[$word] = $infos; } } /** * 开始执行分析 * @parem bool optimize 是否对结果进行优化 * @return bool */ public function StartAnalysis($optimize = true) { if (!$this->isLoadDic) { $this->LoadDict(); } $this->simpleResult = $this->finallyResult = array(); $this->sourceString .= chr(0) . chr(32); $slen = strlen($this->sourceString); $sbcArr = array(); $j = 0; //全角与半角字符对照表 for ($i = 0xFF00; $i < 0xFF5F; $i++) { $scb = 0x20 + $j; $j++; $sbcArr[$i] = $scb; } //对字符串进行粗分 $onstr = ''; $lastc = 1; //1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符 $s = 0; $ansiWordMatch = "[0-9a-z@#%\+\.-]"; $notNumberMatch = "[a-z@#%\+]"; for ($i = 0; $i < $slen; $i++) { $c = $this->sourceString[$i] . $this->sourceString[++$i]; $cn = hexdec(bin2hex($c)); $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn; //ANSI字符 if ($cn < 0x80) { if (preg_match('/' . $ansiWordMatch . '/i', chr($cn))) { if ($lastc != 2 && $onstr != '') { $this->simpleResult[$s]['w'] = $onstr; $this->simpleResult[$s]['t'] = $lastc; $this->_deep_analysis($onstr, $lastc, $s, $optimize); $s++; $onstr = ''; } $lastc = 2; $onstr .= chr(0) . chr($cn); } else { if ($onstr != '') { $this->simpleResult[$s]['w'] = $onstr; if ($lastc == 2) { if (!preg_match('/' . $notNumberMatch . '/i', iconv(UCS2, 'utf-8', $onstr))) $lastc = 4; } $this->simpleResult[$s]['t'] = $lastc; if ($lastc != 4) $this->_deep_analysis($onstr, $lastc, $s, $optimize); $s++; } $onstr = ''; $lastc = 3; if ($cn < 31) { continue; } else { $this->simpleResult[$s]['w'] = chr(0) . chr($cn); $this->simpleResult[$s]['t'] = 3; $s++; } } } //普通字符 else { //正常文字 if (($cn > 0x3FFF && $cn < 0x9FA6) || ($cn > 0xF8FF && $cn < 0xFA2D) || ($cn > 0xABFF && $cn < 0xD7A4) || ($cn > 0x3040 && $cn < 0x312B)) { if ($lastc != 1 && $onstr != '') { $this->simpleResult[$s]['w'] = $onstr; if ($lastc == 2) { if (!preg_match('/' . $notNumberMatch . '/i', iconv(UCS2, 'utf-8', $onstr))) $lastc = 4; } $this->simpleResult[$s]['t'] = $lastc; if ($lastc != 4) $this->_deep_analysis($onstr, $lastc, $s, $optimize); $s++; $onstr = ''; } $lastc = 1; $onstr .= $c; } //特殊符号 else { if ($onstr != '') { $this->simpleResult[$s]['w'] = $onstr; if ($lastc == 2) { if (!preg_match('/' . $notNumberMatch . '/i', iconv(UCS2, 'utf-8', $onstr))) $lastc = 4; } $this->simpleResult[$s]['t'] = $lastc; if ($lastc != 4) $this->_deep_analysis($onstr, $lastc, $s, $optimize); $s++; } //检测书名 if ($cn == 0x300A) { $tmpw = ''; $n = 1; $isok = false; $ew = chr(0x30) . chr(0x0B); while (true) { $w = $this->sourceString[$i + $n] . $this->sourceString[$i + $n + 1]; if ($w == $ew) { $this->simpleResult[$s]['w'] = $c; $this->simpleResult[$s]['t'] = 5; $s++; $this->simpleResult[$s]['w'] = $tmpw; $this->newWords[$tmpw] = 1; if (!isset($this->newWords[$tmpw])) { $this->foundWordStr .= $this->_out_string_encoding($tmpw) . '/nb, '; $this->SetWordInfos($tmpw, array( 'c' => 1, 'm' => 'nb' )); } $this->simpleResult[$s]['t'] = 13; $s++; //最大切分模式对书名继续分词 if ($this->differMax) { $this->simpleResult[$s]['w'] = $tmpw; $this->simpleResult[$s]['t'] = 21; $this->_deep_analysis($tmpw, $lastc, $s, $optimize); $s++; } $this->simpleResult[$s]['w'] = $ew; $this->simpleResult[$s]['t'] = 5; $s++; $i = $i + $n + 1; $isok = true; $onstr = ''; $lastc = 5; break; } else { $n = $n + 2; $tmpw .= $w; if (strlen($tmpw) > 60) { break; } } } //while if (!$isok) { $this->simpleResult[$s]['w'] = $c; $this->simpleResult[$s]['t'] = 5; $s++; $onstr = ''; $lastc = 5; } continue; } $onstr = ''; $lastc = 5; if ($cn == 0x3000) { continue; } else { $this->simpleResult[$s]['w'] = $c; $this->simpleResult[$s]['t'] = 5; $s++; } } //2byte symbol } //end 2byte char } //end for //处理分词后的结果 $this->_sort_finally_result(); } /** * 深入分词 * @parem $str * @parem $ctype (2 英文类, 3 中/韩/日文类) * @parem $spos 当前粗分结果游标 * @return bool */ private function _deep_analysis(&$str, $ctype, $spos, $optimize = true) { //中文句子 if ($ctype == 1) { $slen = strlen($str); //小于系统配置分词要求长度的句子 if ($slen < $this->notSplitLen) { $tmpstr = ''; $lastType = 0; if ($spos > 0) $lastType = $this->simpleResult[$spos - 1]['t']; if ($slen < 5) { //echo iconv(UCS2, 'utf-8', $str).'<br/>'; if ($lastType == 4 && (isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]))) { $str2 = ''; if (!isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)])) { $str2 = substr($str, 2, 2); $str = substr($str, 0, 2); } $ww = $this->simpleResult[$spos - 1]['w'] . $str; $this->simpleResult[$spos - 1]['w'] = $ww; $this->simpleResult[$spos - 1]['t'] = 4; if (!isset($this->newWords[$this->simpleResult[$spos - 1]['w']])) { $this->foundWordStr .= $this->_out_string_encoding($ww) . '/mu, '; $this->SetWordInfos($ww, array( 'c' => 1, 'm' => 'mu' )); } $this->simpleResult[$spos]['w'] = ''; if ($str2 != '') { $this->finallyResult[$spos - 1][] = $ww; $this->finallyResult[$spos - 1][] = $str2; } } else { $this->finallyResult[$spos][] = $str; } } else { $this->_deep_analysis_cn($str, $ctype, $spos, $slen, $optimize); } } //正常长度的句子,循环进行分词处理 else { $this->_deep_analysis_cn($str, $ctype, $spos, $slen, $optimize); } } //英文句子,转为小写 else { if ($this->toLower) { $this->finallyResult[$spos][] = strtolower($str); } else { $this->finallyResult[$spos][] = $str; } } } /** * 中文的深入分词 * @parem $str * @return void */ private function _deep_analysis_cn(&$str, $lastec, $spos, $slen, $optimize = true) { $quote1 = chr(0x20) . chr(0x1C); $tmparr = array(); $hasw = 0; //如果前一个词为 “ , 并且字符串小于3个字符当成一个词处理。 if ($spos > 0 && $slen < 11 && $this->simpleResult[$spos - 1]['w'] == $quote1) { $tmparr[] = $str; if (!isset($this->newWords[$str])) { $this->foundWordStr .= $this->_out_string_encoding($str) . '/nq, '; $this->SetWordInfos($str, array( 'c' => 1, 'm' => 'nq' )); } if (!$this->differMax) { $this->finallyResult[$spos][] = $str; return; } } //进行切分 for ($i = $slen - 1; $i > 0; $i -= 2) { //单个词 $nc = $str[$i - 1] . $str[$i]; //是否已经到最后两个字 if ($i <= 2) { $tmparr[] = $nc; $i = 0; break; } $isok = false; $i = $i + 1; for ($k = $this->dicWordMax; $k > 1; $k = $k - 2) { if ($i < $k) continue; $w = substr($str, $i - $k, $k); if (strlen($w) <= 2) { $i = $i - 1; break; } if ($this->IsWord($w)) { $tmparr[] = $w; $i = $i - $k + 1; $isok = true; break; } } //echo '<hr />'; //没适合词 if (!$isok) $tmparr[] = $nc; } $wcount = count($tmparr); if ($wcount == 0) return; $this->finallyResult[$spos] = array_reverse($tmparr); //优化结果(岐义处理、新词、数词、人名识别等) if ($optimize) { $this->_optimize_result($this->finallyResult[$spos], $spos); } } /** * 对最终分词结果进行优化(把simpleresult结果合并,并尝试新词识别、数词合并等) * @parem $optimize 是否优化合并的结果 * @return bool */ //t = 1 中/韩/日文, 2 英文/数字/符号('.', '@', '#', '+'), 3 ANSI符号 4 纯数字 5 非ANSI符号或不支持字符 private function _optimize_result(&$smarr, $spos) { $newarr = array(); $prePos = $spos - 1; $arlen = count($smarr); $i = $j = 0; //检测数量词 if ($prePos > -1 && !isset($this->finallyResult[$prePos])) { $lastw = $this->simpleResult[$prePos]['w']; $lastt = $this->simpleResult[$prePos]['t']; if (($lastt == 4 || isset($this->addonDic['c'][$lastw])) && isset($this->addonDic['u'][$smarr[0]])) { $this->simpleResult[$prePos]['w'] = $lastw . $smarr[0]; $this->simpleResult[$prePos]['t'] = 4; if (!isset($this->newWords[$this->simpleResult[$prePos]['w']])) { $this->foundWordStr .= $this->_out_string_encoding($this->simpleResult[$prePos]['w']) . '/mu, '; $this->SetWordInfos($this->simpleResult[$prePos]['w'], array( 'c' => 1, 'm' => 'mu' )); } $smarr[0] = ''; $i++; } } for (; $i < $arlen; $i++) { if (!isset($smarr[$i + 1])) { $newarr[$j] = $smarr[$i]; break; } $cw = $smarr[$i]; $nw = $smarr[$i + 1]; $ischeck = false; //检测数量词 if (isset($this->addonDic['c'][$cw]) && isset($this->addonDic['u'][$nw])) { //最大切分时保留合并前的词 if ($this->differMax) { $newarr[$j] = chr(0) . chr(0x28); $j++; $newarr[$j] = $cw; $j++; $newarr[$j] = $nw; $j++; $newarr[$j] = chr(0) . chr(0x29); $j++; } $newarr[$j] = $cw . $nw; if (!isset($this->newWords[$newarr[$j]])) { $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/mu, '; $this->SetWordInfos($newarr[$j], array( 'c' => 1, 'm' => 'mu' )); } $j++; $i++; $ischeck = true; } //检测前导词(通常是姓) else if (isset($this->addonDic['n'][$smarr[$i]])) { $is_rs = false; //词语是副词或介词或频率很高的词不作为人名 if (strlen($nw) == 4) { $winfos = $this->GetWordInfos($nw); if (isset($winfos['m']) && ($winfos['m'] == 'r' || $winfos['m'] == 'c' || $winfos['c'] > 500)) { $is_rs = true; } } if (!isset($this->addonDic['s'][$nw]) && strlen($nw) < 5 && !$is_rs) { $newarr[$j] = $cw . $nw; //echo iconv(UCS2, 'utf-8', $newarr[$j])."<br />"; //尝试检测第三个词 if (strlen($nw) == 2 && isset($smarr[$i + 2]) && strlen($smarr[$i + 2]) == 2 && !isset($this->addonDic['s'][$smarr[$i + 2]])) { $newarr[$j] .= $smarr[$i + 2]; $i++; } if (!isset($this->newWords[$newarr[$j]])) { $this->SetWordInfos($newarr[$j], array( 'c' => 1, 'm' => 'nr' )); $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/nr, '; } //为了防止错误,保留合并前的姓名 if (strlen($nw) == 4) { $j++; $newarr[$j] = chr(0) . chr(0x28); $j++; $newarr[$j] = $cw; $j++; $newarr[$j] = $nw; $j++; $newarr[$j] = chr(0) . chr(0x29); } $j++; $i++; $ischeck = true; } } //检测后缀词(地名等) else if (isset($this->addonDic['a'][$nw])) { $is_rs = false; //词语是副词或介词不作为前缀 if (strlen($cw) > 2) { $winfos = $this->GetWordInfos($cw); if (isset($winfos['m']) && ($winfos['m'] == 'a' || $winfos['m'] == 'r' || $winfos['m'] == 'c' || $winfos['c'] > 500)) { $is_rs = true; } } if (!isset($this->addonDic['s'][$cw]) && !$is_rs) { $newarr[$j] = $cw . $nw; if (!isset($this->newWords[$newarr[$j]])) { $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/na, '; $this->SetWordInfos($newarr[$j], array( 'c' => 1, 'm' => 'na' )); } $i++; $j++; $ischeck = true; } } //新词识别(暂无规则) else if ($this->unitWord) { if (strlen($cw) == 2 && strlen($nw) == 2 && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw]) && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw])) { $newarr[$j] = $cw . $nw; //尝试检测第三个词 if (isset($smarr[$i + 2]) && strlen($smarr[$i + 2]) == 2 && (isset($this->addonDic['a'][$smarr[$i + 2]]) || isset($this->addonDic['u'][$smarr[$i + 2]]))) { $newarr[$j] .= $smarr[$i + 2]; $i++; } if (!isset($this->newWords[$newarr[$j]])) { $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/ms, '; $this->SetWordInfos($newarr[$j], array( 'c' => 1, 'm' => 'ms' )); } $i++; $j++; $ischeck = true; } } //不符合规则 if (!$ischeck) { $newarr[$j] = $cw; //二元消岐处理——最大切分模式 if ($this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7) { $slen = strlen($nw); $hasDiff = false; for ($y = 2; $y <= $slen - 2; $y = $y + 2) { $nhead = substr($nw, $y - 2, 2); $nfont = $cw . substr($nw, 0, $y - 2); if ($this->IsWord($nfont . $nhead)) { if (strlen($cw) > 2) $j++; $hasDiff = true; $newarr[$j] = $nfont . $nhead; } } } $j++; } } //end for $smarr = $newarr; } /** * 转换最终分词结果到 finallyResult 数组 * @return void */ private function _sort_finally_result() { $newarr = array(); $i = 0; foreach ($this->simpleResult as $k => $v) { if (empty($v['w'])) continue; if (isset($this->finallyResult[$k]) && count($this->finallyResult[$k]) > 0) { foreach ($this->finallyResult[$k] as $w) { if (!empty($w)) { $newarr[$i]['w'] = $w; $newarr[$i]['t'] = 20; $i++; } } } else if ($v['t'] != 21) { $newarr[$i]['w'] = $v['w']; $newarr[$i]['t'] = $v['t']; $i++; } } $this->finallyResult = $newarr; $newarr = ''; } /** * 把uncode字符串转换为输出字符串 * @parem str * return string */ private function _out_string_encoding(&$str) { $rsc = $this->_source_result_charset(); if ($rsc == 1) { $rsstr = iconv(UCS2, 'utf-8', $str); } else if ($rsc == 2) { $rsstr = iconv('utf-8', 'gb18030', iconv(UCS2, 'utf-8', $str)); } else { $rsstr = iconv('utf-8', 'big5', iconv(UCS2, 'utf-8', $str)); } return $rsstr; } /** * 获取最终结果字符串(用空格分开后的分词结果) * @return string */ public function GetFinallyResult($spword = ' ', $word_meanings = false) { $rsstr = ''; foreach ($this->finallyResult as $v) { if ($this->resultType == 2 && ($v['t'] == 3 || $v['t'] == 5)) { continue; } $m = ''; if ($word_meanings) { $m = $this->GetWordProperty($v['w']); } $w = $this->_out_string_encoding($v['w']); if ($w != ' ') { if ($word_meanings) { $rsstr .= $spword . $w . $m; } else { $rsstr .= $spword . $w; } } } return $rsstr; } /** * 获取粗分结果,不包含粗分属性 * @return array() */ public function GetSimpleResult() { $rearr = array(); foreach ($this->simpleResult as $k => $v) { if (empty($v['w'])) continue; $w = $this->_out_string_encoding($v['w']); if ($w != ' ') $rearr[] = $w; } return $rearr; } /** * 获取粗分结果,包含粗分属性(1中文词句、2 ANSI词汇(包括全角),3 ANSI标点符号(包括全角),4数字(包括全角),5 中文标点或无法识别字符) * @return array() */ public function GetSimpleResultAll() { $rearr = array(); foreach ($this->simpleResult as $k => $v) { $w = $this->_out_string_encoding($v['w']); if ($w != ' ') { $rearr[$k]['w'] = $w; $rearr[$k]['t'] = $v['t']; } } return $rearr; } /** * 获取索引hash数组 * @return array('word'=>count,...) */ public function GetFinallyIndex() { $rearr = array(); foreach ($this->finallyResult as $v) { if ($this->resultType == 2 && ($v['t'] == 3 || $v['t'] == 5)) { continue; } $w = $this->_out_string_encoding($v['w']); if ($w == ' ') { continue; } if (isset($rearr[$w])) { $rearr[$w]++; } else { $rearr[$w] = 1; } } arsort($rearr); return $rearr; } /** * 获取最终关键字(返回用 "," 间隔的关键字) * @return string */ public function GetFinallyKeywords($num = 10) { $n = 0; $arr = $this->GetFinallyIndex(); $okstr = ''; foreach ($arr as $k => $v) { //排除长度为1的词 if (strlen($k) == 1) { continue; } //排除长度为2的非英文词 elseif (strlen($k) == 2 && preg_match('/[^0-9a-zA-Z]/', $k)) { continue; } //排除单个中文字 elseif (strlen($k) < 4 && !preg_match('/[a-zA-Z]/', $k)) { continue; } $okstr .= ($okstr == '' ? $k : ',' . $k); $n++; if ($n > $num) break; } return $okstr; } /** * 获得保存目标编码 * @return int */ private function _source_result_charset() { if (preg_match("/^utf/", $this->targetCharSet)) { $rs = 1; } else if (preg_match("/^gb/", $this->targetCharSet)) { $rs = 2; } else if (preg_match("/^big/", $this->targetCharSet)) { $rs = 3; } else { $rs = 4; } return $rs; } /** * 编译词典 * @parem $sourcefile utf-8编码的文本词典数据文件<参见范例dict/not-build/base_dic_full.txt> * 注意, 需要PHP开放足够的内存才能完成操作 * @return void */ public function MakeDict($source_file, $target_file = '') { $target_file = ($target_file == '' ? $this->mainDicFile : $target_file); $allk = array(); $fp = fopen($source_file, 'r'); while ($line = fgets($fp, 64)) { if ($line[0] == '@') continue; list($w, $r, $a) = explode(',', $line); /*if( $line='' ) continue; $w = $line; $r = 1; $a = 'n';*/ $a = trim($a); $w = iconv('utf-8', UCS2, $w); $k = $this->_get_index($w); if (isset($allk[$k])) $allk[$k][$w] = array( $r, $a ); else $allk[$k][$w] = array( $r, $a ); } fclose($fp); $fp = fopen($target_file, 'w'); $heade_rarr = array(); $alldat = ''; $start_pos = $this->mask_value * 8; foreach ($allk as $k => $v) { $dat = serialize($v); $dlen = strlen($dat); $alldat .= $dat; $heade_rarr[$k][0] = $start_pos; $heade_rarr[$k][1] = $dlen; $heade_rarr[$k][2] = count($v); $start_pos += $dlen; } print_r($heade_rarr); unset($allk); for ($i = 0; $i < $this->mask_value; $i++) { if (!isset($heade_rarr[$i])) { $heade_rarr[$i] = array( 0, 0, 0 ); } fwrite($fp, pack("Inn", $heade_rarr[$i][0], $heade_rarr[$i][1], $heade_rarr[$i][2])); } fwrite($fp, $alldat); fclose($fp); } /** * 导出词典的词条 * @parem $targetfile 保存位置 * @return void */ public function ExportDict($targetfile) { if (!$this->mainDicHand) { $this->mainDicHand = fopen($this->mainDicFile, 'r'); } $fp = fopen($targetfile, 'w'); for ($i = 0; $i <= $this->mask_value; $i++) { $move_pos = $i * 8; fseek($this->mainDicHand, $move_pos, SEEK_SET); $dat = fread($this->mainDicHand, 8); $arr = unpack('I1s/n1l/n1c', $dat); if ($arr['l'] == 0) { continue; } fseek($this->mainDicHand, $arr['s'], SEEK_SET); $data = @unserialize(fread($this->mainDicHand, $arr['l'])); if (!is_array($data)) continue; foreach ($data as $k => $v) { $w = iconv(UCS2, 'utf-8', $k); fwrite($fp, "{$w},{$v[0]},{$v[1]}\n"); } } fclose($fp); return true; } }