www.gusucode.com > 74cms PHP骑士人才系统 v3.5.1源码程序 > code/upload/include/splitword.class.php
<?php
/*
 * 74cms Chinese word segmentation
 * ============================================================================
 * Copyright: Qishi Network; all rights reserved.
 * Website: http://www.74cms.com;
 * ----------------------------------------------------------------------------
 * This is not free software! You may modify and use the program code only
 * for non-commercial purposes; redistributing the program code in any form
 * or for any purpose is not permitted.
 * ============================================================================
 */

/**
 * Dictionary-based word extractor.
 *
 * The text is cut into characters (a byte above 127 is paired with the byte
 * that follows it), and every run of $minlen..$maxLen consecutive characters
 * that is present in the dictionary is reported.
 *
 * NOTE(review): the two-byte pairing presumably targets a double-byte encoding
 * such as GBK; UTF-8 input would be split mid-character -- confirm with callers.
 */
class SPWord
{
    /** Maximum number of characters in a candidate word. */
    public $maxLen = 5;

    /** Minimum number of characters in a candidate word. */
    public $minlen = 2;

    /** Separator written in front of every matched word and between padded words. */
    public $spchar = ' ';

    /** Dictionary, indexed as $dicword[byte length][word] = 1. */
    public $dicword = array();

    /**
     * PHP 4 style constructor, kept so that code calling it explicitly keeps working.
     */
    public function SPWord()
    {
        $this->__construct();
    }

    /**
     * Loads the dictionary from "word.txt" located next to this file.
     *
     * Every line is trimmed and stored under its byte length. If the file
     * cannot be opened the dictionary simply stays empty (fopen() has already
     * raised its own warning), instead of passing `false` on to fgets()/fclose().
     */
    public function __construct()
    {
        $dicfile = dirname(__FILE__) . "/word.txt";
        $fp = fopen($dicfile, 'r');
        if ($fp === false) {
            return;
        }

        // Compare against false explicitly: a last line consisting of "0"
        // is falsy and must not end the loop early.
        while (($line = fgets($fp, 256)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // A blank line would register the empty string as a word.
                continue;
            }
            $this->dicword[strlen($line)][$line] = 1;
        }
        fclose($fp);
    }

    /**
     * Extracts all dictionary words found in a piece of text.
     *
     * @param string $str Raw text.
     * @return string Matched words, each one preceded by $spchar; '' when $str is empty().
     */
    public function extracttag($str)
    {
        if (empty($str)) {
            return '';
        }

        $tag = '';
        foreach (explode(" ", $this->revisestr($str)) as $fragment) {
            $tag .= $this->rwhods($fragment);
        }
        return $tag;
    }

    /**
     * Splits one fragment into characters and matches them against the dictionary.
     *
     * A byte above 127 is combined with the following byte into one character;
     * any other byte is a character on its own. Each character is trim()med.
     *
     * @param string $str Fragment without separators.
     * @return string Matched words, each one preceded by $spchar.
     */
    public function rwhods($str)
    {
        $str = trim($str);
        $length = strlen($str);

        // Start from an empty list so that an empty fragment does not hand an
        // undefined variable to matchesword().
        $retstr = array();
        for ($i = 0; $i < $length; $i++) {
            if (ord($str[$i]) > 127 && $i + 1 < $length) {
                $retstr[] = trim($str[$i] . $str[$i + 1]);
                $i++;
            } else {
                // Also reached by a trailing lone high byte: keep it as is
                // rather than reading past the end of the string.
                $retstr[] = trim($str[$i]);
            }
        }
        return $this->matchesword($retstr);
    }

    /**
     * Collects every dictionary word that starts at any position of $arr.
     *
     * For each start position the candidate grows one character at a time up
     * to $maxLen characters; candidates of at least $minlen characters are
     * looked up with isword().
     *
     * The scan is a plain loop. The previous recursive form stopped as soon as
     * the leading character was falsy ("0" or "") and then returned NULL,
     * throwing away the words it had already found.
     *
     * @param array  $arr    List of characters.
     * @param string $oldstr Already collected words, used as prefix of the result.
     * @return string $oldstr followed by $spchar . word for every match.
     */
    public function matchesword($arr, $oldstr = '')
    {
        if (empty($arr)) {
            return $oldstr;
        }

        $chars = array_values($arr);
        $total = count($chars);

        for ($start = 0; $start < $total; $start++) {
            $window = min($total - $start, $this->maxLen);
            $candidate = '';
            for ($offset = 0; $offset < $window; $offset++) {
                $candidate .= $chars[$start + $offset];
                if ($offset >= $this->minlen - 1 && $this->isword($candidate)) {
                    $oldstr = $oldstr . $this->spchar . $candidate;
                }
            }
        }
        return $oldstr;
    }

    /**
     * Replaces punctuation and line breaks with spaces.
     *
     * NOTE(review): the ASCII entries in the list below are already handled by
     * the [[:punct:]] pass; presumably full-width variants were intended --
     * confirm against the upstream file before changing the literals.
     *
     * @param string $str Raw text.
     * @return string Text with separators turned into spaces.
     */
    public function revisestr($str)
    {
        $str = preg_replace("/[[:punct:]]/i", ' ', $str);
        $str = str_replace(PHP_EOL, ' ', $str);
        $str = str_replace(array(',', ',', '。', '、', '!', '?', '(', ')'), ' ', $str);
        return $str;
    }

    /**
     * De-duplicates the space separated words of $str and pads each one with wordpad().
     *
     * @param string $str Space separated words.
     * @return string Padded words joined with $spchar; '' when $str is empty().
     */
    public function pad($str)
    {
        if (empty($str)) {
            return '';
        }

        $words = array_unique(explode(" ", $str));
        // wordpad() is an instance method, so bind the callback to $this;
        // naming the class instead is a static call, which PHP 8 rejects.
        $words = array_map(array($this, 'wordpad'), $words);
        return implode($this->spchar, $words);
    }

    /**
     * Right-pads a short word with "0" characters.
     *
     * Words of 8 bytes or more are returned unchanged. Shorter words are padded
     * to strlen + (4 - strlen / 2) bytes, truncated to a whole number.
     *
     * @param string $str Word to pad.
     * @return string Padded word; '' when $str is empty().
     */
    public function wordpad($str)
    {
        if (empty($str)) {
            return '';
        }

        $leng = strlen($str);
        if ($leng >= 8) {
            return $str;
        }

        // Odd lengths give a fractional target; truncate explicitly instead of
        // relying on the implicit float-to-int conversion inside str_pad().
        $target = (int) ($leng + (4 - ($leng / 2)));
        return str_pad($str, $target, '0');
    }

    /**
     * Tells whether $word is present in the dictionary.
     *
     * @param string $word Candidate word.
     * @return bool False when the word is longer than $maxLen * 2 bytes or unknown.
     */
    public function isword($word)
    {
        $slen = strlen($word);
        if ($slen > $this->maxLen * 2) {
            return false;
        }
        return isset($this->dicword[$slen][$word]);
    }
}