資源簡介
共8400多對中英語句,已預處理,中文用jieba分了詞,標點符號沒問題,保存于en-zh.csv,分隔符是制表符\t(不是默認逗號)。
句粒度,但有不少長句,裁剪后5w對也夠用。
原始數據集也在包中,其中en-zh_News.tmx有一部分句子有問題(en-zh.csv丟棄了有問題的句子),如非必要,建議不要直接使用該文件。
另外附贈我對語料的預處理文件(preprocessor.py),以及數據集(pytorch的Dataset)等相關的實現(LangData.py)。
如果有可以改善的地方,歡迎留言。
代碼片段和文件信息
import csv

import torch
import torch.utils.data.dataloader
from torch.utils.data.dataset import Dataset
# import pandas as pd
# Indices reserved for the special tokens; real words are numbered after them.
PAD_token = 0
SOS_token = 1
EOS_token = 2
UKN_token = 3


class Lang:
    """Vocabulary of one language: word <-> index maps plus word counts.

    Attributes:
        name: Language tag. The value ``'cn'`` switches ``add_sentence`` to
            iterating the sentence directly instead of splitting on spaces.
        word2index: Maps each known word to its integer index.
        word2count: Maps each known word to the number of times it was added.
        index2word: Inverse of ``word2index``; also holds the special tokens.
        n_words: Next free index (special tokens included in the count).
    """

    def __init__(self, name):
        self.name = name
        self._reset()

    def _reset(self):
        """Empty the vocabulary, keeping only the four special tokens."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {
            PAD_token: "PAD",
            SOS_token: "SOS",
            EOS_token: "EOS",
            UKN_token: "UKN",
        }
        self.n_words = len(self.index2word)

    def add_sentence(self, sentence):
        """Add every token of ``sentence`` to the vocabulary.

        For ``name == 'cn'`` the sentence is iterated as-is, i.e. character by
        character for a ``str`` or item by item for a token sequence. For any
        other language the sentence is split on single spaces.
        """
        tokens = sentence if self.name == 'cn' else sentence.split(' ')
        for word in tokens:
            self.add_word(word)

    def add_word(self, word):
        """Register ``word``: assign a new index, or bump its count if known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        self.word2index[word] = self.n_words
        self.word2count[word] = 1
        self.index2word[self.n_words] = word
        self.n_words += 1

    def trim(self, min_count=2):
        """Drop every word seen fewer than ``min_count`` times.

        The vocabulary is rebuilt from the surviving words, so indices are
        reassigned and each survivor's count restarts at 1 (it is re-added
        through ``add_word``). Summary statistics are printed to stdout.
        """
        keep = [word for word, count in self.word2count.items()
                if count >= min_count]
        total = len(self.word2index)
        # Guard the ratio: an empty vocabulary would otherwise divide by zero.
        ratio = len(keep) / total if total else 0.0
        print('total', total)
        print('keep', len(keep))
        print('keep {:.4%}'.format(ratio))

        self._reset()
        for word in keep:
            self.add_word(word)
#?小寫,修剪和刪除非字符字符
def normalizeString(s):
    """Return ``s`` lower-cased with leading/trailing whitespace removed.

    NOTE: no punctuation or non-letter filtering is performed here; earlier
    regex-based cleanup steps were disabled in this function.
    """
    return s.lower().strip()
def filter_pair(p, min_length, max_length):
    """Tell whether both sentences of pair ``p`` have an acceptable length.

    Length is the number of pieces obtained by splitting on single spaces.

    Args:
        p: Indexable pair; ``p[0]`` and ``p[1]`` are the two sentence strings.
        min_length: Minimum number of tokens (inclusive) for each sentence.
        max_length: Maximum number of tokens (inclusive) for each sentence,
            or ``None`` for no upper bound.

    Returns:
        ``True`` if both sentences are within the bounds, else ``False``.
    """
    for idx in (0, 1):
        n_tokens = len(p[idx].split(' '))
        if n_tokens < min_length:
            return False
        # ``None`` means "unbounded"; comparing an int with None would raise.
        if max_length is not None and n_tokens > max_length:
            return False
    return True
def filter_pairs(pairs, min_length, max_length):
    """Return the pairs from ``pairs`` that ``filter_pair`` accepts.

    ``min_length`` and ``max_length`` are forwarded unchanged to
    ``filter_pair`` for every pair; the input order is preserved.
    """
    return [pair for pair in pairs
            if filter_pair(pair, min_length, max_length)]
class?LangDataset(Dataset):
????def?__init__(self?lang1?lang2?root_path?reverse=False?transform=None
?????????????????min_length=1?max_length=None?lower=True?trim_count=None):
????????self.transform?=?transform
????????print(“Reading?lines...“)
????????with?open(root_path?+?‘%s-%s.csv‘?%?(lang1?lang2)?encoding=‘utf-8-sig‘)?as?f:
????????????reader?=?csv.reader(f?delimiter=‘\t‘)
????????????if?lower:
????????????????data?=?[[normalizeString(s)?for?s?in?l]?for?l?in?reader]
????????????else:
????????????????data?=?[l?for?l?in?reader]
????????????f.close()
????????print(“Read?%s?sentence?pairs“?%?len(data))
????????if?min_length?>?1?or?max_length?is?not?None:
????????????data?=?filter_pairs(data?min_length?max_length)
????????self.data?=?data
????????self.min_length?=?min_length
????????self.m
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2019-08-11 18:12  data\
     文件     1315138  2019-07-21 00:01  data\en-cn.txt
     文件    11876774  2019-07-21 00:01  data\en-fra.txt
     文件    20299270  2019-08-06 17:57  data\en-zh.csv
     文件      903387  2019-07-31 16:49  data\en-zh_HIT.txt
     文件    53605933  2019-06-29 13:14  data\en-zh_News.tmx
     文件     4678577  2019-06-29 03:31  data\en-zh_News2.tmx
     文件     1479544  2019-08-02 21:58  data\en-zh_simple.csv
     文件        4951  2019-08-07 22:44  LangData.py
     文件        3637  2019-08-06 16:11  preprocessor.py
評論
共有 條評論