-
大小: 6KB | 文件類型: .py | 金幣: 2 | 下載: 1 次 | 發布日期: 2021-06-10
- 語言: Python
- 標簽: word2vec, tensorflow
資源簡介
word2vec的tensorflow實現,來自黃文堅的“tensorflow實戰”
代碼片段和文件信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

import collections
import math
import os
import random
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf
# Base URL that archive names are appended to when they must be downloaded.
url = 'http://mattmahoney.net/dc/'


# Step 1: download dataset
def may_download(filename, expected_bytes):
    """Make sure ``filename`` exists locally and has the expected size.

    When ``filename`` is not present on disk it is fetched from
    ``url + filename`` and stored under the same name. The size on disk is
    then compared with ``expected_bytes``.

    Args:
        filename: Path of the local file; also the name appended to ``url``
            when a download is needed.
        expected_bytes: Exact size in bytes the file must have.

    Returns:
        The path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``. The
            actual size is printed before raising.
    """
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size != expected_bytes:
        # Print the size that was found so a truncated download is easy to spot.
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename)
    print('Found and verified', filename)
    return filename
# Fetch the text8 archive if it is missing and check its size in bytes.
filename = may_download('text8.zip', 31344016)
# Step 2: data transformation
def read_data(filename):
    """Read the first member of a zip archive as a list of tokens.

    Args:
        filename: Path to a zip archive. Only the first member listed by
            ``namelist()`` is read.

    Returns:
        A list of strings obtained by splitting the decoded member on
        whitespace.
    """
    with zipfile.ZipFile(filename) as f:
        raw = f.read(f.namelist()[0])
    # ZipFile.read() returns bytes; decoding them as UTF-8 directly gives the
    # same text as tf.compat.as_str_any() without needing TensorFlow here.
    return raw.decode('utf-8').split()
# Test: load the corpus and report how many tokens were read.
words = read_data(filename)
print('Datas size', len(words))
# Step 3: make dataset
# Default number of ids to keep, counting the 'UNK' placeholder at id 0.
vocabulary_size = 50000


def build_dataset(words, vocab_size=None):
    """Encode a token sequence as integer ids over a frequency-limited vocabulary.

    The ``vocab_size - 1`` most frequent tokens receive ids 1, 2, ... in
    descending frequency order. Every other token is mapped to id 0 ('UNK').

    Args:
        words: Sequence of tokens. It is traversed twice, so it must be
            re-iterable (a list, not a one-shot generator).
        vocab_size: Total number of ids including 'UNK'. Defaults to the
            module-level ``vocabulary_size``.

    Returns:
        A 4-tuple ``(data, count, dictionary, reverse_dictionary)``:
            data: list of ids, one per input token.
            count: ``['UNK', n_unknown]`` followed by ``(token, frequency)``
                tuples for the kept tokens.
            dictionary: mapping token -> id.
            reverse_dictionary: mapping id -> token.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    # The first entry is a mutable list because the UNK total is only known
    # after the whole corpus has been encoded.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            # Token fell outside the kept vocabulary.
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)
# The raw token list is not used after encoding; drop the reference.
del words
# Test: show the head of the frequency table and a decoded sample of the data.
print('Most common words (+UNK) ', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
# Step 4: generate training samples
# Read cursor into the id sequence, shared by successive generate_batch() calls.
data_index = 0


def generate_batch(batch_size, num_skips, skip_window, sequence=None):
    """Build one batch of (centre word, context word) id pairs.

    A window of ``2 * skip_window + 1`` ids slides over the sequence. For each
    window position, ``num_skips`` distinct context positions are drawn at
    random and paired with the id in the middle of the window.

    The module-level ``data_index`` cursor is advanced, wrapping around at the
    end of the sequence, so consecutive calls continue where the previous one
    stopped.

    Args:
        batch_size: Number of pairs to produce; must be a multiple of
            ``num_skips``.
        num_skips: Context words sampled per centre word; at most
            ``2 * skip_window``.
        skip_window: Number of ids considered on each side of the centre.
        sequence: Optional id sequence to sample from. Defaults to the
            module-level ``data``. The cursor is shared whichever sequence is
            used.

    Returns:
        ``(batch, labels)``: ``batch`` is an int32 array of shape
        ``(batch_size,)`` holding centre ids; ``labels`` is an int32 array of
        shape ``(batch_size, 1)`` holding the matching context ids.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= (2 * skip_window)
    corpus = data if sequence is None else sequence
    corpus_len = len(corpus)

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1
    # A bounded deque drops the oldest id automatically on each append.
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(corpus[data_index])
        data_index = (data_index + 1) % corpus_len

    for i in range(batch_size // num_skips):
        target = skip_window
        # Never pair the centre with itself or reuse a context position.
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(corpus[data_index])
        data_index = (data_index + 1) % corpus_len

    # The window was advanced once past the last centre that was used. Rewind
    # the cursor by one window so the next call refills the buffer with these
    # same ids and continues with the following centre instead of skipping
    # `span` words.
    data_index = (data_index + corpus_len - span) % corpus_len
    return batch, labels
# Test: draw a small batch and print each (centre -> context) pair with its words.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
# Step 5: training
# These names match the generate_batch() parameters batch_size, num_skips and
# skip_window; the code that consumes the others is outside this excerpt.
batch_size = 128
embedding_size = 128  # presumably the embedding vector length - confirm at use site
# NOTE(review): generate_batch() uses a window of 2 * skip_window + 1 ids, so
# 128 means a 257-id window with only num_skips = 2 samples per centre.
# Confirm this value is intended.
skip_window = 128
num_skips = 2
valid_size = 16  # presumably the number of validation examples - confirm
valid_window = 100  # presumably the id range validation examples come from - confirm
valid_examples?=?
- 上一篇:python37_d.lib文件
- 下一篇:mnist_normal
評論
共有 條評論