91av视频/亚洲h视频/操亚洲美女/外国一级黄色毛片 - 国产三级三级三级三级

  • 大小:
    文件類型: .py
    金幣: 1
    下載: 0 次
    發布日期: 2021-06-03
  • 語言: Python
  • 標簽: word2vec 代碼

資源簡介

TensorFlow實戰中實現word2vec代碼(含中文注釋)

資源截圖

代碼片段和文件信息

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

#%%
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import collections
import math
import os
import random
import urllib
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf

# Step 1: Download the data.
# Base URL that maybe_download() prepends to the requested file name.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size.

    Args:
        filename: Local path to look for. When the file is missing, the name
            is appended to the module-level ``url`` to form the download
            address and the result is saved under this same path.
        expected_bytes: Exact size in bytes the local file must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    # Only hit the network when no local copy exists.
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)

    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        # Show the actual size so a truncated download is easy to spot.
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename
            + '. Can you get to it with a browser?')
    return filename

# Fetch the text8 archive (or reuse a local copy) and check its byte size.
filename = maybe_download('text8.zip', 31344016)

# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words.

    Args:
        filename: Path to the zip archive.

    Returns:
        The whitespace-separated tokens of the archive's first member,
        decoded as UTF-8.
    """
    with zipfile.ZipFile(filename) as f:
        # ZipFile.read() returns bytes; a plain UTF-8 decode is all that is
        # needed here, so no TensorFlow helper is required for this step.
        data = f.read(f.namelist()[0]).decode('utf-8').split()
    return data

# Load the whole corpus into memory as a flat list of word strings.
words = read_data(filename)
print('Data size', len(words))

# Step 2: Build the dictionary and replace rare words with UNK token.
# Total number of vocabulary entries, including the 'UNK' placeholder.
vocabulary_size = 50000

def build_dataset(words, max_vocabulary_size=None):
    """Encode a word sequence as integer ids over a frequency-capped vocabulary.

    Args:
        words: Sequence of word strings.
        max_vocabulary_size: Total number of vocabulary entries, including
            the 'UNK' placeholder. Defaults to the module-level
            ``vocabulary_size`` when omitted.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)``:

        * data: ``words`` converted to ids; words outside the vocabulary
          become 0.
        * count: ``['UNK', n_unknown]`` followed by ``(word, frequency)``
          pairs, most frequent first.
        * dictionary: mapping word -> id.
        * reverse_dictionary: mapping id -> word.
    """
    if max_vocabulary_size is None:
        max_vocabulary_size = vocabulary_size

    # Slot 0 is reserved for 'UNK'; its real count is filled in below.
    count = [['UNK', -1]]
    count.extend(
        collections.Counter(words).most_common(max_vocabulary_size - 1))

    # Ids follow the order of `count`. setdefault keeps ids unique and
    # contiguous even if the corpus itself contains the literal token 'UNK';
    # plain assignment would hand the next word a duplicate id in that case.
    dictionary = {}
    for word, _ in count:
        dictionary.setdefault(word, len(dictionary))

    # Convert every word to its id and tally the out-of-vocabulary words.
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary

# Encode the corpus and keep the lookup tables for later steps.
data, count, dictionary, reverse_dictionary = build_dataset(words)
#print(len(count

評論

共有 條評論