91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小: 6KB
    文件類型: .py
    金幣: 2
    下載: 1 次
    發布日期: 2021-06-10
  • 語言: Python
  • 標簽: word2vec, tensorflow

資源簡介

word2vec的tensorflow實現,來自黃文堅的“tensorflow實戰”

資源截圖

代碼片段和文件信息

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

import collections
import math
import os
import random
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf
# Base URL that dataset file names are appended to when downloading.
url = 'http://mattmahoney.net/dc/'

'''Step1:  download dataset'''


def may_download(filename, expected_bytes):
    """Fetch ``filename`` if it is not on disk, then verify its size.

    Args:
        filename: Local path of the file; also appended to the module-level
            ``url`` to form the download address.
        expected_bytes: Size in bytes the file must have to be accepted.

    Returns:
        The path of the verified file (the path reported by
        ``urlretrieve`` when a download took place).

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    if not os.path.exists(filename):
        # urlretrieve returns (local_path, headers); only the path is needed.
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        # Show the actual size so a truncated download is easy to diagnose.
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename)
    return filename

# Download (if needed) and size-check the corpus archive.
filename = may_download('text8.zip', 31344016)

'''Step2: data transformation'''


def read_data(filename):
    """Return the whitespace-separated tokens of the first zip member.

    Args:
        filename: Path to a zip archive.

    Returns:
        A list of ``str`` tokens obtained by splitting the decoded content
        of the archive's first member on whitespace.
    """
    with zipfile.ZipFile(filename) as f:
        # ZipFile.read always yields bytes, so a plain UTF-8 decode gives the
        # same text as tf.compat.as_str_any without needing TensorFlow here.
        data = f.read(f.namelist()[0]).decode('utf-8').split()
    return data

"""Test"""
# Load the corpus as a flat list of tokens and report how many there are.
words = read_data(filename)
print('Datas size', len(words))

'''Step3: make dataset'''
# Vocabulary size: the 'UNK' slot plus the (vocabulary_size - 1) most
# frequent words.
vocabulary_size = 50000


def build_dataset(words):
    """Map a token list to integer ids, folding rare words into 'UNK'.

    Args:
        words: Sequence of string tokens.

    Returns:
        A 4-tuple ``(data, count, dictionary, reverse_dictionary)``:
        ``data`` is the list of ids for ``words`` (0 for words outside the
        vocabulary); ``count`` is ``[['UNK', n_unknown], (word, freq), ...]``
        in descending frequency; ``dictionary`` maps word -> id;
        ``reverse_dictionary`` maps id -> word.
    """
    # Slot 0 is reserved for unknown words; its count is filled in below.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    # Ids follow the order of `count`, so more frequent words get smaller ids.
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # id of 'UNK'
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
# The raw token list is no longer needed once it has been encoded as ids.
del words

"""Test"""
print('Most common words (+UNK) ', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

'''Step4: generate training samples'''
# Read cursor into the module-level `data`; advanced by generate_batch and
# wrapped around at the end of the corpus.
data_index = 0


def generate_batch(batch_size, num_skips, skip_window):
    """Produce one batch of (center word, context word) id pairs.

    Reads from the module-level ``data`` sequence starting at the global
    ``data_index`` and advances that cursor (modulo ``len(data)``).

    Args:
        batch_size: Number of pairs to emit; must be a multiple of
            ``num_skips``.
        num_skips: Number of context words sampled for each center word;
            must not exceed ``2 * skip_window``.
        skip_window: Number of positions considered on each side of the
            center word.

    Returns:
        ``(batch, labels)``: ``batch`` is an int32 array of shape
        ``(batch_size,)`` holding center ids; ``labels`` is an int32 array of
        shape ``(batch_size, 1)`` holding the matching context ids.

    Raises:
        AssertionError: If the argument constraints above are violated.
    """
    global data_index
    assert batch_size % num_skips == 0, \
        'batch_size must be a multiple of num_skips'
    assert num_skips <= (2 * skip_window), \
        'num_skips must not exceed 2 * skip_window'
    # Every slot is written in the loops below, so uninitialized storage is fine.
    batch = np.empty(shape=(batch_size,), dtype=np.int32)
    labels = np.empty(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window | center | skip_window ]
    buffer = collections.deque(maxlen=span)

    # Prime the sliding window with the first `span` ids.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size // num_skips):
        target = skip_window  # start on the center so the while loop redraws
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Draw a window position that is neither the center nor already used.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)

            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one position; maxlen drops the oldest id.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    return batch, labels

"""Test"""
# Draw a tiny batch and print each pair as "center_id word -> context_id word".
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

'''Step 5: training'''
batch_size = 128       # presumably passed to generate_batch as batch_size
embedding_size = 128   # presumably the embedding vector length; usage not visible here
# NOTE(review): the scraped listing had 128 here, which looks like a copy of
# the two values above; that would mean a 257-word window. Set to 1 to match
# the demo call generate_batch(..., skip_window=1) earlier in this file and
# the reference TensorFlow word2vec example. Confirm against the book.
skip_window = 1
num_skips = 2          # must satisfy num_skips <= 2 * skip_window (see generate_batch)
valid_size = 16        # usage not visible here; presumably the validation sample size
valid_window = 100     # usage not visible here; presumably the id range sampled for validation
valid_examples?=?

評論

共有 條評論