91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 26.06MB
    文件類型: .zip
    金幣: 1
    下載: 0 次
    發布日期: 2023-06-28
  • 語言: 其他
  • 標簽: 文本分類

資源簡介

實現文本分類主要包括以下幾個步驟:文本分詞處理、特征選擇、特征權重計算、文本特征向量表示,基于訓練文本的特征向量數據訓練SVM模型,然後對測試集進行特征向量表示,代入訓練得到的SVM模型中進行預測分類,達到93%的準確率。

資源截圖

代碼片段和文件信息

__author__ = 'ShadowWalker'

import codecs
import math
import sys

# Feature selection using the chi-square test.
# Files are read as UTF-8.

# Stop-word definition.
#
# NOTE(review): this table was rebuilt from a mis-encoded listing in which
# every space and every unencodable character was rendered as "?", and in
# which full-width punctuation appears to have been folded to ASCII (several
# marks are listed twice). Literals that were only visible as "?" or as an
# empty pair of quotes are kept in that visible form ("?", "") together with
# " "; the exact original characters must be confirmed against the upstream
# file.
_IGNORED_TOKENS = frozenset([
    # Markup residue and stray numeric tokens.
    'nbsp', 'nbsp10', '3.6', '0144',
    # Whitespace. The original listed '/t' and '/n', which look like typos for
    # the escape sequences; both spellings are accepted, and "\t" is added.
    ' ', '', '/t', '/n', '\t', '\n',
    # Punctuation.
    ',', '。', '!', '、', '―', '?', '@', ':', ';', '%', '&',
    '(', ')', '《', '》', '[', ']', '{', '}', '<', '>',
    '*', '.', '-',
    # English stop words.
    'about', 'there', 'see', 'can', 'U', 'L', 'in', 'a', 'our',
])


def ignore(s):
    """Return True when token ``s`` must be skipped as a stop word.

    Args:
        s: a single token (string) taken from a segmented document.

    Returns:
        bool: True if ``s`` is one of the listed stop words, punctuation
        marks or whitespace tokens; False otherwise.
    """
    return s in _IGNORED_TOKENS

#?print(stopwords)

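# Compute the four counts a, b, c, d needed by the chi-square test:
# a: number of documents in this class that contain the word
# b: number of documents outside this class that contain the word
# c: number of documents in this class that do not contain the word
# d: number of documents outside this class that do not contain the word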

#
# Class codes of the corpus. The commas between the elements are required:
# without them Python concatenates adjacent string literals into one element.
ClassCode = [
    'C000007', 'C000008', 'C000010', 'C000013', 'C000014',
    'C000016', 'C000020', 'C000022', 'C000023', 'C000024',
]

#?構建每個類別的詞Set

# Path of the word-segmented files, located relative to sys.path[0].
# textCutbasePath = "G:\\ChineseTextClassify\\SogouCCut\\"
textCutbasePath = sys.path[0] + "\\SogouCCut\\"
# Build the word collections of every class.
def buildItemSets(classDocCount):
    """Collect the words of the segmented documents, grouped by class.

    For every code in ``ClassCode`` the files ``0.cut`` up to
    ``<classDocCount - 1>.cut`` inside the class directory (``textCutbasePath``
    followed by the class code) are read and split on single spaces. Empty
    tokens and tokens rejected by ``ignore`` are dropped.

    Args:
        classDocCount: number of documents read for each class.

    Returns:
        tuple: ``(termDic, termClassDic)`` where
            * ``termDic`` maps a class code to the set of all words kept from
              that class's documents;
            * ``termClassDic`` maps a class code to a list holding one word
              set per document, in file-index order.

    Raises:
        IOError/OSError: propagated from ``open`` if a document is missing.
    """
    termDic = dict()
    # One list per class, one set per document inside that list.
    termClassDic = dict()
    for eachclass in ClassCode:
        currClassPath = textCutbasePath + eachclass + "\\"
        eachClassWordSets = set()
        eachClassWordList = list()
        for i in range(classDocCount):
            eachDocPath = currClassPath + str(i) + ".cut"
            # The context manager closes the handle even when reading fails;
            # the original left every file open.
            # NOTE(review): the file is opened with the platform default
            # encoding, as in the original; confirm the encoding of the
            # .cut files before pinning one here.
            with open(eachDocPath, 'r') as eachFileObj:
                eachFileContent = eachFileObj.read()
            eachFileSet = set()
            # NOTE(review): the separator was mis-encoded in the listing and
            # is assumed to be a single space.
            for eachword in eachFileContent.split(" "):
                # Tokens produced by split(" ") cannot contain the separator,
                # so a plain emptiness check replaces the original strip().
                if eachword and not ignore(eachword):
                    eachFileSet.add(eachword)
                    eachClassWordSets.add(eachword)
            eachClassWordList.append(eachFileSet)
        termDic[eachclass] = eachClassWordSets
        termClassDic[eachclass] = eachClassWordList
    return termDic, termClassDic



#?對得到的兩個詞典進行計算,可以得到a?b?c?d?值
#?K?為每個類別選取的特征個數

# Chi-square formula.
def ChiCalc(a, b, c, d):
    """Return the chi-square score of a 2x2 word/class contingency table.

    Computes ``(a*d - b*c)**2 / ((a+c) * (a+b) * (b+d) * (c+d))``. The usual
    factor N (total number of documents) is not applied.

    Args:
        a: documents in the class that contain the word.
        b: documents outside the class that contain the word.
        c: documents in the class that do not contain the word.
        d: documents outside the class that do not contain the word.

    Returns:
        float: the score, or 0.0 when any marginal total is zero (for example
        a word that occurs in every document), where the statistic is
        undefined and the original code raised ZeroDivisionError.
    """
    denominator = float((a + c) * (a + b) * (b + d) * (c + d))
    if denominator == 0:
        return 0.0
    return float(pow(a * d - b * c, 2)) / denominator

def?featureSelection(termDic?termClassDic?K):
????termCountDic?=?dict()
????for?key?in?termDic:
????????classWordSets?=?termDic[key]
????????classTermCountDic?=?dict()
????????for?eachword?in?classWordSets:??#?對某個類別下的每一個單詞的?a?b?c?d?進行計算
????????????a?=?0
????????????b?=?0
????????????c?=?0
????????????d?=?0
????????????for?eachclass?in?termClassD

?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????目錄???????????0??2016-07-31?08:48??ChineseTextClassify-master\
?????目錄???????????0??2016-07-31?08:48??ChineseTextClassify-master\.idea\
?????文件??????????19??2016-07-31?08:48??ChineseTextClassify-master\.idea\.name
?????文件?????????284??2016-07-31?08:48??ChineseTextClassify-master\.idea\ChineseTextClassify.iml
?????文件?????????164??2016-07-31?08:48??ChineseTextClassify-master\.idea\encodings.xml
?????文件?????????215??2016-07-31?08:48??ChineseTextClassify-master\.idea\misc.xml
?????文件?????????290??2016-07-31?08:48??ChineseTextClassify-master\.idea\modules.xml
?????目錄???????????0??2016-07-31?08:48??ChineseTextClassify-master\.idea\scopes\
?????文件?????????139??2016-07-31?08:48??ChineseTextClassify-master\.idea\scopes\scope_settings.xml
?????文件?????????164??2016-07-31?08:48??ChineseTextClassify-master\.idea\vcs.xml
?????文件???????34607??2016-07-31?08:48??ChineseTextClassify-master\.idea\workspace.xml
?????目錄???????????0??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\
?????文件????????1178??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\ChineseCut.py
?????目錄???????????0??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PKU_GB\
?????文件??????341175??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PKU_GB\pku_test.txt
?????文件??????549918??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PKU_GB\pku_test_gold.txt
?????文件??????443710??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PKU_GB\pku_test_result.txt
?????文件?????5887805??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PKU_GB\pku_training.txt
?????文件??????347101??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PKU_GB\pku_training_words.txt
?????文件?????2196634??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PKU_GB\score.txt
?????目錄???????????0??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PKU_GB\scripts\
?????文件????????3543??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PKU_GB\scripts\mwseg.pl
?????文件????????7225??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PKU_GB\scripts\score
?????文件????????6926??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\PreHMM.py
?????文件????????6098??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\Viterbi.py
?????文件??????277953??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\emit.txt
?????文件?????????303??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\tran.txt
?????文件???????14092??2016-07-31?08:48??ChineseTextClassify-master\ChineseSegmentation\worddict.txt
?????文件????????5522??2016-07-31?08:48??ChineseTextClassify-master\FeatureSelecion.py
?????文件????????3518??2016-07-31?08:48??ChineseTextClassify-master\FeatureWeight.py
?????目錄???????????0??2016-07-31?08:48??ChineseTextClassify-master\LIBSVM\
............此處省略6047個文件信息

評論

共有 條評論