資源簡介
LSTM數據集+python源碼,實測在Theano環境平臺下可用!詳情見我的博客:http://blog.csdn.net/zhongkelee/article/details/52090352
代碼片段和文件信息
from?__future__?import?print_function
from?six.moves?import?xrange
import?six.moves.cPickle?as?pickle
import?gzip
import?os
import?numpy
import?theano
def prepare_data(seqs, labels, maxlen=None):
    """Pad a batch of sequences into a time-major matrix and a mask.

    Every sequence is written into one *column* of the result, so the
    returned arrays have shape ``(longest_length, n_samples)``: the axes
    are swapped with respect to the input list of sequences.

    :param seqs: list of sequences; each sequence is stored into an
        ``int64`` array, so its items must be integers.
    :param labels: list of labels, parallel to ``seqs``.
    :param maxlen: if not None, every sequence whose length is
        ``>= maxlen`` is dropped together with its label. Sequences are
        never truncated.
    :return: ``(x, x_mask, labels)`` where ``x`` holds the zero-padded
        sequences, ``x_mask`` holds 1.0 where ``x`` contains real data and
        0.0 where it is padding, and ``labels`` are the labels of the
        sequences that were kept. Returns ``(None, None, None)`` when
        ``maxlen`` is set and no sequence survives the length filter.
    """
    lengths = [len(s) for s in seqs]

    if maxlen is not None:
        # NOTE(review): the comparison operator was lost in the extracted
        # text; strict ``<`` matches the upstream Theano tutorial code.
        new_seqs = []
        new_labels = []
        new_lengths = []
        for length, seq, label in zip(lengths, seqs, labels):
            if length < maxlen:
                new_seqs.append(seq)
                new_labels.append(label)
                new_lengths.append(length)
        lengths = new_lengths
        labels = new_labels
        seqs = new_seqs

        if len(lengths) < 1:
            return None, None, None

    n_samples = len(seqs)
    # From here on ``maxlen`` is the length of the longest kept sequence,
    # not the caller-supplied cut-off.
    maxlen = numpy.max(lengths)

    x = numpy.zeros((maxlen, n_samples)).astype('int64')
    x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)
    for idx, seq in enumerate(seqs):
        # Fill the leading rows of column ``idx``; the rest stays zero.
        x[:lengths[idx], idx] = seq
        x_mask[:lengths[idx], idx] = 1.

    return x, x_mask, labels
def get_dataset_file(dataset, default_dataset, origin):
    """Resolve the path of a dataset file, downloading it if necessary.

    Lookup order:

    1. ``dataset`` is used as given if it has a directory component or
       already names an existing file.
    2. Otherwise ``<this module's dir>/../data/<dataset>`` is used when
       that file exists, or when ``dataset`` is the default dataset name
       (so the download below lands in the data directory).
    3. If the resolved path still does not exist and the file name equals
       ``default_dataset``, the file is downloaded from ``origin``.

    :param dataset: path or bare file name of the dataset.
    :param default_dataset: file name of the default dataset; only this
        file is ever downloaded.
    :param origin: URL the default dataset is fetched from.
    :return: the resolved path. The file is not guaranteed to exist when
        ``dataset`` is missing and is not the default dataset.
    """
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            "..",
            "data",
            dataset
        )
        if os.path.isfile(new_path) or data_file == default_dataset:
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == default_dataset:
        # Imported lazily: only needed on the download path.
        from six.moves import urllib
        print('Downloading data from %s' % origin)
        urllib.request.urlretrieve(origin, dataset)

    return dataset
def?load_data(path=“imdb.pkl“?n_words=100000?valid_portion=0.1?maxlen=None
??????????????sort_by_len=True):
????‘‘‘Loads?the?dataset
????:type?path:?String
????:param?path:?The?path?to?the?dataset?(here?IMDB)
????:type?n_words:?int
????:param?n_words:?The?number?of?word?to?keep?in?the?vocabulary.
????????All?extra?words?are?set?to?unknow?(1).
????:type?valid_portion:?float
????:param?valid_portion:?The?proportion?of?the?full?train?set?used?for
????????the?validation?set.
????:type?maxlen:?None?or?positive?int
????:param?maxlen:?the?max?sequence?length?we?use?in?the?train/valid?set.
????:type?sort_by_len:?bool
????:name?sort_by_len:?Sort?by?the?sequence?lenght?for?the?train
????????valid?and?test?set.?This?allow?faster?execution?as?it?cause
????????less?padding?per?minibatch.?Another?mechanism?must?be?used?to
????????shuffle?the?train?set?
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     文件    33213513  2016-07-29 20:06  lstm\imdb.pkl
     文件        5405  2016-07-29 19:57  lstm\imdb.py
     文件        5242  2016-07-29 20:04  lstm\imdb.pyc
     文件       22671  2016-07-29 19:50  lstm\lstm.py
     文件     5649642  2016-07-29 21:48  lstm\lstm_model.npz
     文件         470  2016-07-29 21:41  lstm\lstm_model.npz.pkl
     目錄           0  2016-07-29 20:36  lstm\
- 上一篇:CNN+pythoncode8.18.zip
- 下一篇:人臉識別Python代碼
評論
共有 條評論