python

超轻量级php框架startmvc

python正向最大匹配分词和逆向最大匹配分词的实例

更新时间:2020-06-12 03:12:01 作者:startmvc
摘要：本文依次给出正向最大匹配与逆向最大匹配两种分词方法的 Python 示例代码。示例开头定义了编码常量 CODEC='utf-8'，以及把其他编码的字符串转换为 unicode 的辅助函数 u(s, encoding)。

正向最大匹配


# -*- coding:utf-8 -*-
 
# Codec name for this example; nothing in the code visible here reads it
# (main() passes the literal 'utf-8' instead).
CODEC = 'utf-8'
 
def u(s, encoding):
    """Return ``s`` as a text (unicode) string.

    Args:
        s: A text string, or an encoded byte string.
        encoding: Codec name used to decode ``s`` when it is not already text.

    Returns:
        ``s`` itself when it is already text, otherwise ``s`` decoded with
        ``encoding``. Decoding errors propagate to the caller.
    """
    # ``unicode`` only exists on Python 2; on Python 3 the text type is ``str``
    # and ``str(bytes, encoding)`` performs the same decode.
    try:
        text_type = unicode  # noqa: F821 (Python 2 builtin)
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)
 
def fwd_mm_seg(wordDict, maxLen, str):
    """Segment text with forward maximum matching.

    Starting at the left end of the text, take a window of at most ``maxLen``
    characters and shrink it from the right until the window is found in
    ``wordDict`` or only one character is left. That piece is emitted and
    matching resumes right after it.

    Args:
        wordDict: Container of known words; only membership (``in``) is used.
        maxLen: Longest candidate word, in characters. Must be at least 1.
        str: Text to segment. (The name shadows the builtin; it is kept so
            existing keyword callers keep working.)

    Returns:
        The segments in text order. Joined together they equal the input.

    Raises:
        ValueError: If ``maxLen`` is less than 1.
    """
    import logging

    if maxLen < 1:
        # A zero-width window never consumes input, so the loop below would
        # append empty strings forever.
        raise ValueError('maxLen must be at least 1, got %r' % (maxLen,))

    log = logging.getLogger(__name__)
    wordList = []
    segStr = str
    while segStr:
        wordLen = min(maxLen, len(segStr))
        subStr = segStr[:wordLen]
        # Drop trailing characters until a dictionary hit; a single
        # character is always accepted as a fallback.
        while wordLen > 1 and subStr not in wordDict:
            wordLen -= 1
            subStr = subStr[:wordLen]
        log.debug('segment: %r', subStr)
        wordList.append(subStr)
        segStr = segStr[wordLen:]
    return wordList
 
 
def main():
    """Load ``words.dic`` and print a forward-maximum-matching demo.

    Reads one dictionary word per line from ``words.dic`` (a relative path),
    segments a fixed sample sentence with ``fwd_mm_seg`` using a maximum word
    length of 10, and prints the sample followed by its segments joined
    with ``==``.
    """
    wordDict = {}
    # ``with`` closes the dictionary file even if decoding a line fails.
    with open('words.dic') as fp_dict:
        for eachWord in fp_dict:
            wordDict[u(eachWord.strip(), 'utf-8')] = 1
    segStr = u'你好世界hello world'
    # Single-argument print in call form behaves the same on Python 2 and 3.
    print(segStr)
    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))
 
 
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
 

逆向最大匹配


# -*- coding:utf-8 -*-
 
 
def u(s, encoding):
    """Return ``s`` as a text (unicode) string.

    Args:
        s: A text string, or an encoded byte string.
        encoding: Codec name used to decode ``s`` when it is not already text.

    Returns:
        ``s`` itself when it is already text, otherwise ``s`` decoded with
        ``encoding``. Decoding errors propagate to the caller.
    """
    # ``unicode`` only exists on Python 2; on Python 3 the text type is ``str``
    # and ``str(bytes, encoding)`` performs the same decode.
    try:
        text_type = unicode  # noqa: F821 (Python 2 builtin)
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)
 
# Codec name for this example; nothing in the code visible here reads it
# (main() passes the literal 'utf-8' instead).
CODEC = 'utf-8'
 
def bwd_mm_seg(wordDict, maxLen, str):
    """Segment text with backward (reverse) maximum matching.

    Starting at the right end of the text, take a window of at most ``maxLen``
    characters and shrink it from the left until the window is found in
    ``wordDict`` or only one character is left. That piece is emitted and
    matching resumes on the text before it.

    Args:
        wordDict: Container of known words; only membership (``in``) is used.
        maxLen: Longest candidate word, in characters. Must be at least 1.
        str: Text to segment. (The name shadows the builtin; it is kept so
            existing keyword callers keep working.)

    Returns:
        The segments in text order. Joined together they equal the input.

    Raises:
        ValueError: If ``maxLen`` is less than 1.
    """
    import logging

    if maxLen < 1:
        # With a zero width, ``s[-0:]`` is the whole string and ``s[:-0]`` is
        # empty, so the text would be swallowed in one piece and the loop
        # would then never terminate.
        raise ValueError('maxLen must be at least 1, got %r' % (maxLen,))

    log = logging.getLogger(__name__)
    wordList = []
    segStr = str
    while segStr:
        wordLen = min(maxLen, len(segStr))
        subStr = segStr[-wordLen:]
        # Drop leading characters until a dictionary hit; a single
        # character is always accepted as a fallback.
        while wordLen > 1 and subStr not in wordDict:
            wordLen -= 1
            subStr = subStr[-wordLen:]
        log.debug('segment: %r', subStr)
        wordList.append(subStr)
        segStr = segStr[:-wordLen]
    # Pieces were collected right-to-left; restore reading order.
    wordList.reverse()
    return wordList
 
 
def main():
    """Load ``words.dic`` and print a backward-maximum-matching demo.

    Reads one dictionary word per line from ``words.dic`` (a relative path),
    segments a fixed sample sentence with ``bwd_mm_seg`` using a maximum word
    length of 10, and prints the sample followed by its segments joined
    with ``==``.
    """
    wordDict = {}
    # ``with`` closes the dictionary file even if decoding a line fails.
    with open('words.dic') as fp_dict:
        for eachWord in fp_dict:
            wordDict[u(eachWord.strip(), 'utf-8')] = 1
    # Plain ``u''`` instead of ``ur''``: the literal has no backslashes, so
    # the value is the same, and ``ur`` is a syntax error on Python 3.
    segStr = u'你好世界hello world'
    # Single-argument print in call form behaves the same on Python 2 and 3.
    print(segStr)
    wordList = bwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))
 
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
 

以上这篇 Python 正向最大匹配分词和逆向最大匹配分词的实例就是小编分享给大家的全部内容了，希望能给大家一个参考，也希望大家多多支持脚本之家。

python 正向 逆向 最大匹配分词