資源簡介
本資源為天津大學社會信息檢索的一項大作業,基于爬取的語料庫,總體實現了三個功能:TFIDF計算,兩句子相似度計算,基于語料庫的搜索引擎。
具體任務如下:
1) TFIDF: 給定用自己名字命名的文件夾,請自己爬取一定數(shù)量的網(wǎng)頁、微博形成語料集合,存入該文件夾;在線狀態(tài)下,對其中的詞語進行TFIDF統(tǒng)計。
2) SIM: 在線狀態(tài)下,從網(wǎng)頁頁面輸入任意兩個句子,求其相似度,包括:內積,余弦及Jaccard三種度量方式;同時,可實現(xiàn)對導入的文件夾語料的tfidf統(tǒng)計。
3) SJet: 實現基于向量空間模型(VSM)的搜索引擎。

代碼片段和文件信息
# -*- coding: utf-8 -*-
import os
import subprocess
import sys

from flask import (
    Flask,
    render_template,
    request,
    redirect,
    url_for,
    jsonify,
    send_from_directory,
    abort,
)
from werkzeug.utils import secure_filename
# Python 2 only: reloading ``sys`` re-exposes ``setdefaultencoding`` so that
# implicit str/unicode conversions (e.g. concatenating subprocess output with
# rendered templates) use UTF-8 instead of ASCII.
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    # Python 3 has no builtin ``reload`` and needs no default-encoding override.
    pass

# The Flask application object that all routes below are registered on.
app = Flask(__name__)
@app.route('/')
def home():
    """Render the landing page template ``index.html``."""
    return render_template('index.html')
@app.route('/TFIDF', methods=['GET', 'POST'])
def upload_file():
    """Serve the TFIDF page and, on POST, run the TFIDF script on an upload.

    GET renders ``TFIDF.html``. POST takes the uploaded file from form field
    ``data``, saves it under ``uploads/`` with a sanitized name, runs
    ``Get_TFIDF.py`` with the saved path as its single argument, and returns
    the rendered page followed by a success message.

    Aborts with 400 when the sanitized filename is empty.
    """
    if request.method == 'GET':
        return render_template('TFIDF.html')

    f = request.files['data']
    filename = secure_filename(f.filename)
    if not filename:
        # No file selected, or the name sanitized to nothing: saving to
        # 'uploads/' itself would fail.
        abort(400)
    if not os.path.isdir('uploads'):
        os.makedirs('uploads')
    tempdir = 'uploads/' + filename
    f.save(tempdir)
    # Argument list instead of a shell string: the path is passed verbatim as
    # one argument and is never interpreted by a shell.
    subprocess.call(['python', 'Get_TFIDF.py', tempdir])
    # NOTE(review): the original literal spans a line break; HTML markup may
    # have been lost from it - confirm against the real app.py.
    return render_template('TFIDF.html') + '\n提交成功!'
@app.route('/TFIDF_Result', methods=['GET'])
def download_file():
    """Send ``allresults.zip`` as an attachment, or abort with 404 if absent.

    The route only accepts GET (plus the HEAD/OPTIONS methods Flask registers
    automatically), so no explicit method check is needed; dropping it lets
    HEAD requests be answered like GET.
    """
    dfilename = "allresults.zip"
    if os.path.isfile(dfilename):
        return send_from_directory('', dfilename, as_attachment=True)
    abort(404)
@app.route('/SIM', methods=['GET', 'POST'])
def input_sentence():
    """Serve the similarity page and, on POST, compare two submitted sentences.

    GET renders ``SIM.html``. POST reads form fields ``sentence1`` and
    ``sentence2``, runs ``Similarity_Compare.py`` with the two GBK-encoded
    sentences as arguments, and returns the rendered page followed by the
    script's standard output.

    Aborts with 400 when either form field is missing.
    """
    if request.method == 'GET':
        return render_template('SIM.html')

    s1 = request.form.get("sentence1")
    s2 = request.form.get("sentence2")
    if s1 is None or s2 is None:
        # .get() returns None for an absent field; fail with a client error
        # instead of crashing on None.encode().
        abort(400)

    # Pass user input as discrete arguments, never through a shell: the
    # original concatenated it into an os.popen() command line, allowing
    # command injection and splitting sentences on whitespace.
    proc = subprocess.Popen(
        ['python', 'Similarity_Compare.py', s1.encode('gbk'), s2.encode('gbk')],
        stdout=subprocess.PIPE,
    )
    simres = proc.communicate()[0]
    # NOTE(review): the script output is inserted into the page unescaped, and
    # the literals around it may have lost HTML markup - confirm both.
    return render_template('SIM.html') + '計算結果\n' + simres + '\n'
@app.route('/SJet', methods=['GET', 'POST'])
def mySJet():
    """Serve the search page and, on POST, run the search script on the query.

    GET renders ``SJet.html``. POST reads form field ``userinput``, runs
    ``SJet.py`` with the GBK-encoded query as its single argument, and returns
    the rendered page followed by the script's standard output.

    Aborts with 400 when the form field is missing.
    """
    if request.method == 'GET':
        return render_template('SJet.html')

    myinput = request.form.get("userinput")
    if myinput is None:
        # Absent field: report a client error rather than crash on None.encode().
        abort(400)

    # The original wrapped the query in single quotes inside an os.popen()
    # shell string, which a quote in the query could break out of. An argument
    # list passes the query as exactly one argument with no shell involved.
    # NOTE(review): where the shell did not strip those quotes, SJet.py saw
    # them literally; it now never does - confirm it does not rely on them.
    proc = subprocess.Popen(
        ['python', 'SJet.py', myinput.encode('gbk')],
        stdout=subprocess.PIPE,
    )
    simres1 = proc.communicate()[0]
    # NOTE(review): the script output is inserted into the page unescaped, and
    # the literal before it may have lost HTML markup - confirm both.
    return render_template('SJet.html') + '搜索結果\n' + simres1
@app.route('/SJetRes/<int:post_id>', methods=['GET'])
def show_res(post_id):
    """Send the stored article with the given id as an attachment.

    The file served is ``database/articles/article<post_id>.txt``; aborts with
    404 when it does not exist.

    NOTE(review): the route as found had no URL variable although this view
    requires ``post_id``; the placeholder appears to have been dropped. The
    ``int`` converter is an assumption based on ``str(post_id)`` - confirm.
    """
    dfilename1 = "database/articles/article" + str(post_id) + ".txt"
    if os.path.isfile(dfilename1):
        return send_from_directory('', dfilename1, as_attachment=True)
    abort(404)
if __name__ == '__main__':
    # Start the development server on all interfaces, port 6789 (as an int).
    app.run(host='0.0.0.0', port=6789)
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-06-12 02:03  SIRC\
     文件        2440  2018-06-12 01:01  SIRC\Get_TFIDF.py
     文件          55  2018-05-21 13:07  SIRC\SIMresult.txt
     文件        2447  2018-06-12 01:40  SIRC\SJet.py
     文件        2143  2018-05-21 01:34  SIRC\Similarity_Compare.py
     文件      238095  2018-05-21 09:59  SIRC\allresults.zip
     文件        2302  2018-06-12 01:28  SIRC\app.py
     文件        2027  2018-05-20 23:58  SIRC\app.pyc
     目錄           0  2018-05-08 14:18  SIRC\databa
     目錄           0  2018-05-08 21:17  SIRC\databa
     文件          12  2018-05-08 14:15  SIRC\databa
     文件         160  2018-05-08 14:15  SIRC\databa
     文件         106  2018-05-08 14:15  SIRC\databa
     文件         104  2018-05-08 14:15  SIRC\databa
     文件         365  2018-05-08 14:15  SIRC\databa
     文件         417  2018-05-08 14:15  SIRC\databa
     文件         148  2018-05-08 14:15  SIRC\databa
     文件          84  2018-05-08 14:15  SIRC\databa
     文件         284  2018-05-08 14:15  SIRC\databa
     文件         403  2018-05-08 14:15  SIRC\databa
     文件          83  2018-05-08 14:15  SIRC\databa
     文件         130  2018-05-08 14:15  SIRC\databa
     文件         189  2018-05-08 14:15  SIRC\databa
     文件          86  2018-05-08 14:15  SIRC\databa
     文件          91  2018-05-08 14:15  SIRC\databa
     文件         431  2018-05-08 14:15  SIRC\databa
     文件         119  2018-05-08 14:15  SIRC\databa
     文件         439  2018-05-08 14:15  SIRC\databa
     文件         383  2018-05-08 14:15  SIRC\databa
     文件         162  2018-05-08 14:15  SIRC\databa
     文件         415  2018-05-08 14:15  SIRC\databa
............此處省略2257個文件信息
評論
共有 條評論