Size: 6KB · File type: .py · Coins: 1 · Downloads: 1 · Published: 2021-06-07
- Language: Python
- Tags: (none)
Resource description
A PPO (Proximal Policy Optimization) implementation based on TensorFlow. Dependencies: tensorflow 1.4 or later, and gym.
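For context (this note is not part of the original listing): the loss assembled in _get_loss below is the standard PPO clipped surrogate with an entropy bonus. With probability ratio and clipped objective

$$ r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)}, \qquad L^{\text{CLIP}}(\theta) = \hat{\mathbb{E}}_t\!\left[\min\!\big(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\right], $$

this snippet uses ε = 0.2, a one-step TD advantage estimate, and an entropy weight of 0.01.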
Code snippet and file info
import tensorflow as tf
import numpy as np
import gym
class PPO:
    def __init__(self, n_features, n_actions):
        self.n_actions = n_actions
        self.n_features = n_features
        self.learning_rate = 0.0015
        self.sess = tf.Session()
        self.observe = tf.placeholder(tf.float32, [None, self.n_features])
        # Trainable policy/value net, plus a frozen copy of the policy
        # that supplies the denominator of the PPO probability ratio.
        self.v, self.act_prob, self.params = self._build_net('pi', train=True)
        _, self.act_prob_old, self.params_old = self._build_net('old_pi', train=False)
        self._get_loss()
        self.sess.run(tf.global_variables_initializer())
    def _build_net(self, name, train):
        with tf.variable_scope(name):
            # NOTE: the original created this initializer but never passed it
            # to the layers; it is wired in below as presumably intended.
            initer = tf.initializers.truncated_normal(0.0, 0.1)
            # Critic head: two tanh layers -> scalar state value.
            hidden = tf.layers.dense(self.observe, 20, tf.nn.tanh,
                                     kernel_initializer=initer, trainable=train)
            hidden = tf.layers.dense(hidden, 20, tf.nn.tanh,
                                     kernel_initializer=initer, trainable=train)
            v = tf.layers.dense(hidden, 1, activation=None,
                                kernel_initializer=initer, trainable=train)
            # Actor head: tanh layers -> softmax distribution over actions.
            hidden1 = tf.layers.dense(self.observe, 20, tf.nn.tanh,
                                      kernel_initializer=initer, trainable=train)
            hidden1 = tf.layers.dense(hidden1, 20, tf.nn.tanh,
                                      kernel_initializer=initer, trainable=train)
            hidden1 = tf.layers.dense(hidden1, self.n_actions, tf.nn.tanh,
                                      kernel_initializer=initer, trainable=train)
            act_prob = tf.layers.dense(hidden1, self.n_actions, tf.nn.softmax,
                                       kernel_initializer=initer, trainable=train)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return v, act_prob, params
    def _get_loss(self):
        self.adv = tf.placeholder(tf.float32, [None])
        self.v_next = tf.placeholder(tf.float32, [None])
        self.action = tf.placeholder(tf.int32, [None])
        self.reward = tf.placeholder(tf.float32, [None])
        # self.v has shape [None, 1] while the targets are [None]; squeeze it
        # so the subtraction doesn't silently broadcast to [None, None].
        td_error = self.reward + 0.95 * self.v_next - tf.squeeze(self.v, axis=1)
        v_loss = tf.reduce_mean(tf.square(td_error))
        # Probability of the taken action under the current and old policies.
        act_encode = tf.one_hot(self.action, self.n_actions)
        prob = tf.reduce_sum(self.act_prob * act_encode, axis=1)
        prob_old = tf.reduce_sum(self.act_prob_old * act_encode, axis=1)
        ratio = tf.exp(tf.log(tf.clip_by_value(prob, 1e-10, 1.0))
                       - tf.log(tf.clip_by_value(prob_old, 1e-10, 1.0)))
        # PPO clipped surrogate with epsilon = 0.2.
        clip_ratio = tf.clip_by_value(ratio, 1.0 - 0.2, 1.0 + 0.2)
        clip_loss = tf.reduce_mean(tf.minimum(ratio * self.adv, clip_ratio * self.adv))
        entropy_loss = -tf.reduce_mean(tf.reduce_sum(
            self.act_prob * tf.log(tf.clip_by_value(self.act_prob, 1e-10, 1.0)), axis=1))
        # Objective to maximize: surrogate - value loss + entropy bonus;
        # the optimizer minimizes its negative below.
        self.total_loss = clip_loss - v_loss + 0.01 * entropy_loss
        # The original passed a constant 0 as global_step, so the decay never
        # advanced; a real step counter lets the schedule take effect.
        self.global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(0.0015, self.global_step, 200, 0.95)
        self.train_op = tf.train.AdamOptimizer(learning_rate).minimize(
            -self.total_loss, global_step=self.global_step)
        self.old_pi_update = [tf.assign(t, e) for t, e in zip(self.params_old, self.params)]
    def learn(self, observe, v_pred, adv, reward, act):
        # The original preview is cut off inside this call; the remaining
        # feeds follow directly from the placeholders defined in _get_loss.
        loss, _ = self.sess.run([self.total_loss, self.train_op],
                                feed_dict={self.observe: observe, self.v_next: v_pred,
                                           self.adv: adv, self.reward: reward,
                                           self.action: act})
        return loss
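A minimal usage sketch (not part of the original file, which is truncated above): a driver loop for gym's CartPole-v0. The method and attribute names on PPO come from the snippet; everything else (episode structure, advantage plumbing, variable names) is an illustrative assumption about how the author intended learn() and old_pi_update to be called.

# --- hypothetical driver loop, assuming the PPO class above ---
env = gym.make('CartPole-v0')
agent = PPO(n_features=env.observation_space.shape[0],
            n_actions=env.action_space.n)

for episode in range(500):
    obs = env.reset()
    buf_obs, buf_act, buf_rew = [], [], []
    done = False
    while not done:
        # Sample an action from the current policy head.
        prob = agent.sess.run(agent.act_prob,
                              feed_dict={agent.observe: obs[None, :]})[0]
        act = np.random.choice(agent.n_actions, p=prob)
        next_obs, rew, done, _ = env.step(act)
        buf_obs.append(obs)
        buf_act.append(act)
        buf_rew.append(rew)
        obs = next_obs

    # One-step TD targets: critic value at each successor state,
    # zero at the terminal state.
    batch_obs = np.asarray(buf_obs, dtype=np.float32)
    next_states = np.vstack([batch_obs[1:], obs[None, :]])
    v_next = np.squeeze(agent.sess.run(
        agent.v, feed_dict={agent.observe: next_states}), axis=1)
    v_next[-1] = 0.0
    v = np.squeeze(agent.sess.run(
        agent.v, feed_dict={agent.observe: batch_obs}), axis=1)
    rewards = np.asarray(buf_rew, dtype=np.float32)
    adv = rewards + 0.95 * v_next - v   # same TD(0) advantage as _get_loss

    # Sync the frozen old-policy copy before updating, then take a step.
    agent.sess.run(agent.old_pi_update)
    loss = agent.learn(batch_obs, v_next, adv, rewards,
                       np.asarray(buf_act, dtype=np.int32))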