-
大小: 6.08MB | 文件類型: .zip | 金幣: 2 | 下載: 0 次 | 發布日期: 2023-09-24
- 語言: Python
- 標簽:
資源簡介
Sequential event experiment based on travel notes crawled from XieCheng (Ctrip),基于50W攜程出行游記的采集與順承事件圖譜構建

代碼片段和文件信息
#!/usr/bin/env python3
# coding: utf-8
# File: pattern.py
# Author: lhy
# Date: 18-7-15
import?pymongo
import?re
import?jieba
from?sentence_parser?import?*
class?EventGraph:
????def?__init__(self):
????????conn?=?pymongo.MongoClient()
????????self.pattern?=?re.compile(r‘(.*)(其次|然后|接著|隨后|接下來)(.*)‘)
????????self.col?=?conn[‘travel‘][‘doc‘]
????????self.col_insert?=?conn[‘travel‘][‘events‘]
????????self.parse_handler?=?LtpParser()
    '''長句切分'''
????def?seg_long_sents(self?content):
????????return?[sentence?for?sentence?in?re.split(r‘[??!!。;;::\n\r….·]‘?content.replace(‘?‘‘‘).replace(‘\u3000‘‘‘))?if?len(sentence)?>?5]
    '''短句切分'''
????def?process_subsent(self?content):
????????return?[s?for?s?in?re.split(r‘[、,和與及且跟()~▲.]‘?content)?if?len(s)>1]
    '''處理數據庫中的文本'''
????def?process_doc(self):
????????count?=?0
????????for?item?in?self.col.find():
????????????content?=?item[‘content‘]
????????????events_all?=?self.collect_event(content)
????????????if?events_all:
????????????????data?=?{}
????????????????data[‘events‘]?=?events_all
????????????????self.col_insert.insert(data)
????????????else:
????????????????continue
    '''統計收集EVENT'''
????def?collect_event(self?content):
????????events_all?=?[]
????????sents=?self.seg_long_sents(content)
????????for?sent?in?sents:
????????????events?=?self.event_extract(sent)
????????????if?events:
????????????????events_all.append(events)
????????return?events_all
    '''順承事件抽取'''
????def?event_extract(self?sent):
????????result?=?self.pattern.findall(sent)
????????if?result:
????????????event_seqs?=?[]
????????????for?tmp?in?result:
????????????????pre?=?tmp[0]
????????????????post?=?tmp[2]
????????????????pre_sents?=?self.process_subsent(pre)
????????????????post_sents?=?self.process_subsent(post)
????????????????if?pre_sents?and?post_sents:
????????????????????event_seqs?+=?pre_sents
????????????????????event_seqs?+=?post_sents
????????????????else:
????????????????????continue
????????????‘‘‘對事件進行結構化‘‘‘
????????????if?event_seqs:
????????????????events?=?self.extract_phrase(event_seqs)
????????????????return?events
????????????else:
????????????????pass
????????return?[]
    '''將一個長句中的句子進行分解,提取出其中的vob短語'''
????def?extract_phrase(self?event_seqs):
????????events?=?[]
????????for?event?in?event_seqs:
????????????vobs?=?self.vob_exract(event)
????????????if?vobs:
????????????????events?+=?vobs
????????return?events
????‘‘‘提取VOB關系‘‘‘
????def?vob_exract(self?content):
????????vobs?=?[]
????????words?=?list(jieba.cut(content))
????????if?len(words)?>=?300:
????????????return?[]
????????postags?=?self.parse_handler.get_postag(words)
????????tuples?child_dict_list?=?self.parse_handler.parser_main(words?postags)
????????for?tuple?in?tuples:
????????????rel?=?tuple[-1]
????????????pos_verb=?tuple[4][0]
????????????pos_object?=?tuple[2][0]
????????????if?rel?==?‘VOB‘?and?(pos_verb?pos_object)?in?[(‘v‘?‘n‘)?(‘v‘?‘i‘)]:
????????????????phrase?=?‘‘.
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2018-12-15 05:16  SequentialEventExtration-master\
     文件          93  2018-12-15 05:16  SequentialEventExtration-master\.gitattributes
     目錄           0  2018-12-15 05:16  SequentialEventExtration-master\.idea\
     文件         398  2018-12-15 05:16  SequentialEventExtration-master\.idea\SequentialEventGraph.iml
     文件         706  2018-12-15 05:16  SequentialEventExtration-master\.idea\misc.xml
     文件         292  2018-12-15 05:16  SequentialEventExtration-master\.idea\modules.xml
     文件         180  2018-12-15 05:16  SequentialEventExtration-master\.idea\vcs.xml
     文件       20948  2018-12-15 05:16  SequentialEventExtration-master\.idea\workspace.xml
     文件        7111  2018-12-15 05:16  SequentialEventExtration-master\README.md
     目錄           0  2018-12-15 05:16  SequentialEventExtration-master\event_graph\
     目錄           0  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\
     目錄           0  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\
     文件       30798  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\vis.css
     文件     1532584  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\vis.js
     文件      781766  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\vis.map
     文件       22008  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\vis.min.css
     文件      582497  2018-12-15 05:16  SequentialEventExtration-master\event_graph\VIS\dist\vis.min.js
     文件        3312  2018-12-15 05:16  SequentialEventExtration-master\event_graph\event_extract.py
     文件        4001  2018-12-15 05:16  SequentialEventExtration-master\event_graph\event_graph.py
     文件        7007  2018-12-15 05:16  SequentialEventExtration-master\event_graph\sentence_parser.py
     文件    13643483  2018-12-15 05:16  SequentialEventExtration-master\event_graph\seq_events.txt
     文件       55935  2018-12-15 05:16  SequentialEventExtration-master\event_graph\travel_event_graph.html
     目錄           0  2018-12-15 05:16  SequentialEventExtration-master\image\
     文件      297401  2018-12-15 05:16  SequentialEventExtration-master\image\all.png
     文件       54270  2018-12-15 05:16  SequentialEventExtration-master\image\book.png
     文件       45649  2018-12-15 05:16  SequentialEventExtration-master\image\food.png
     文件      213664  2018-12-15 05:16  SequentialEventExtration-master\image\graph.png
     文件       86232  2018-12-15 05:16  SequentialEventExtration-master\image\plane.png
     文件       99399  2018-12-15 05:16  SequentialEventExtration-master\image\train.png
     目錄           0  2018-12-15 05:16  SequentialEventExtration-master\news_spider\
     目錄           0  2018-12-15 05:16  SequentialEventExtration-master\news_spider\.idea\
............此處省略22個文件信息
評論
共有 條評論