Dueling DQN Trading: Profit or Loss? (2)


On our journey exploring the intersection of finance and artificial intelligence, today we dive into the world of reinforcement learning (RL) applied to stock trading. Picking up from the previous article, "Dueling DQN Trading: Profit or Loss?", which covered the basics of RL and its applications in financial markets, we now deepen our understanding by implementing Dueling Double Deep Q-Networks (Dueling DQN).

First, import the necessary libraries.

import numpy as np
import torch
import torch.nn as nn
from typing import Iterable, Union
from datetime import datetime, timedelta
import ptan
import pathlib
import argparse
import gym.wrappers
import torch.optim as optim
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import Accuracy, Loss
from tensorboardX import SummaryWriter

from mymodule import SimpleFFDQN, EpsilonTracker, Actions, preprocess, StocksEnv
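SimpleFFDQN, EpsilonTracker, Actions, preprocess and StocksEnv come from the author's own mymodule, which is not shown in this article. For readers following along, here is a minimal sketch of what a dueling feed-forward Q-network such as SimpleFFDQN typically looks like; the layer sizes are assumptions, not the article's actual implementation:

class DuelingFFDQN(nn.Module):
    """Dueling architecture: separate streams for the state value V(s) and the advantages A(s, a)."""
    def __init__(self, obs_len: int, actions_n: int):
        super().__init__()
        self.fc_val = nn.Sequential(          # state-value stream V(s)
            nn.Linear(obs_len, 512), nn.ReLU(),
            nn.Linear(512, 512), nn.ReLU(),
            nn.Linear(512, 1),
        )
        self.fc_adv = nn.Sequential(          # advantage stream A(s, a)
            nn.Linear(obs_len, 512), nn.ReLU(),
            nn.Linear(512, 512), nn.ReLU(),
            nn.Linear(512, actions_n),
        )

    def forward(self, x):
        val = self.fc_val(x)
        adv = self.fc_adv(x)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
        return val + (adv - adv.mean(dim=1, keepdim=True))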

The Bellman equation is one of the key equations of dynamic programming. It computes the expected value of the state-value function or the state-action value function in a Markov decision process (MDP), and it describes how the value of the current state relates to the values of possible future states through the long-term cumulative reward under an optimal policy.

In reinforcement learning, the Bellman equation is used to estimate value functions or Q-functions, which in turn guide the agent toward an optimal policy. Estimating the value of a state, or the Q-value of a state-action pair, is at the heart of learning optimal policies with Q-learning and Deep Q-Networks (DQN).

V(s) = \max_a \left[ R(s, a) + \gamma \sum_{s'} P(s' \mid s, a)\, V(s') \right]

In the formula above:

Part 1 discussed V(s) and R(s, a): V(s) is the value function of state s, and R(s, a) is the reward received for taking action a in state s. P(s' | s, a) is the probability of transitioning to state s' after taking action a in state s, and γ is the discount factor that discounts future rewards relative to immediate rewards.
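To make the equation concrete, here is a tiny hand-computed Bellman backup for a made-up two-state MDP; all numbers are purely illustrative:

# Toy Bellman backup for one state with two actions (illustrative numbers only)
gamma = 0.99
V = {"s1": 2.0, "s2": 5.0}   # current value estimates of the successor states
# Action "buy": reward 1.0, moves to s2 with prob 0.8 and stays in s1 with prob 0.2
q_buy = 1.0 + gamma * (0.8 * V["s2"] + 0.2 * V["s1"])   # 1.0 + 0.99 * 4.4 = 5.356
# Action "skip": reward 0.0, stays in s1 with prob 1.0
q_skip = 0.0 + gamma * V["s1"]                          # 0.99 * 2.0 = 1.98
V_s1 = max(q_buy, q_skip)    # Bellman update: V(s1) becomes 5.356
print(V_s1)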

def calc_loss(batch, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = unpack_batch(batch)

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    if dones is not None:
        done_mask = torch.BoolTensor(dones.astype(bool)).to(device)
    else:
        done_mask = torch.BoolTensor(np.array([0], dtype=np.uint8))

    # Q-values predicted by the online network for the actions that were actually taken
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    # Double DQN: the online network chooses the best action in the next state...
    next_state_actions = net(next_states_v).max(1)[1]
    # ...and the target network evaluates that action
    next_state_values = tgt_net.target_model(next_states_v).gather(1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    next_state_values[done_mask] = 0.0
    # The Bellman equation gives the expected Q-values for the current state-action pairs
    expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    # Loss between predicted and expected Q-values
    return nn.MSELoss()(state_action_values, expected_state_action_values)
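calc_loss relies on an unpack_batch helper that the article never shows. The sketch below is an assumption about what that helper does, based on how calc_loss consumes its output and on ptan's ExperienceFirstLast transitions:

def unpack_batch(batch):
    # Split a list of ptan ExperienceFirstLast transitions into numpy arrays
    states, actions, rewards, dones, last_states = [], [], [], [], []
    for exp in batch:
        states.append(np.array(exp.state, copy=False))
        actions.append(exp.action)
        rewards.append(exp.reward)
        dones.append(exp.last_state is None)
        # For terminal transitions, reuse the first state as a placeholder
        last_states.append(np.array(exp.state if exp.last_state is None else exp.last_state, copy=False))
    return (np.array(states, copy=False), np.array(actions),
            np.array(rewards, dtype=np.float32), np.array(dones, dtype=np.uint8),
            np.array(last_states, copy=False))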

BATCH_SIZE = 32
BARS_COUNT = 10
EPS_START = 1.0
EPS_FINAL = 0.1
EPS_STEPS = 1000000
GAMMA = 0.99
REPLAY_SIZE = 100000
REPLAY_INITIAL = 10000
REWARD_STEPS = 2
LEARNING_RATE = 0.0001
STATES_TO_EVALUATE = 1000
# These hyperparameters (batch size, exploration schedule, discount factor, etc.) strongly shape the model's learning dynamics
writer = SummaryWriter(log_dir='logs')
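EpsilonTracker is also imported from mymodule and not shown; it anneals the exploration rate of the epsilon-greedy selector using the parameters above. A minimal sketch, assuming a simple linear decay from EPS_START to EPS_FINAL over EPS_STEPS frames:

class LinearEpsilonTracker:
    """Linearly decay selector.epsilon from eps_start to eps_final over eps_frames steps."""
    def __init__(self, selector, eps_start: float, eps_final: float, eps_frames: int):
        self.selector = selector
        self.eps_start = eps_start
        self.eps_final = eps_final
        self.eps_frames = eps_frames

    def frame(self, frame_idx: int):
        eps = self.eps_start - frame_idx / self.eps_frames
        self.selector.epsilon = max(self.eps_final, eps)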
train_path = "ch08-small-quotesYNDX_150101_151231.csv"
val_path = "ch08-small-quotesYNDX_150101_151231.csv"

The heart of our experiment is the training loop, where we orchestrate the process of learning an optimal trading policy. By defining a batch generator and a per-batch training function, we optimize the neural network with a Bellman-equation loss, the key component that updates Q-values from the transitions observed in the environment.

tp = preprocess(train_path)
vp = preprocess(val_path)

# Create the training and validation environments
env = StocksEnv(tp, bars_count=10, commission=0.1, reset_on_close=True, state_1d=False,
                random_ofs_on_reset=True, reward_on_close=False, volumes=True)
env_val = StocksEnv(vp, bars_count=10, commission=0.1, reset_on_close=True, state_1d=False,
                    random_ofs_on_reset=True, reward_on_close=False, volumes=True)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the online network and its target network
net = SimpleFFDQN(env.observation_space.shape[0], env.action_space.n).to(device)
tgt_net = ptan.agent.TargetNet(net)

# Initialize the action selector and epsilon tracker
selector = ptan.actions.EpsilonGreedyActionSelector(EPS_START)
eps_tracker = EpsilonTracker(selector, EPS_START, EPS_FINAL, EPS_STEPS)

# Initialize the DQN agent
agent = ptan.agent.DQNAgent(net, selector, device=device)

# Initialize the experience source and the replay buffer it feeds
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, GAMMA, steps_count=REWARD_STEPS)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)

# Initialize the optimizer
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

Next, set up the batch generator over the experience buffer and define the training and validation functions.

# Batch generator: keep the replay buffer populated and yield training batches
def batch_generator(buffer: ptan.experience.ExperienceReplayBuffer, initial: int, batch_size: int):
    buffer.populate(initial)
    while True:
        buffer.populate(1)
        yield buffer.sample(batch_size)
# Training step: one gradient update per batch
def train_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = calc_loss(batch=batch, net=net, tgt_net=tgt_net, gamma=GAMMA ** REWARD_STEPS, device=device)
    loss_v.backward()
    optimizer.step()
    eps_tracker.frame(engine.state.iteration)
    # Cache a fixed set of states once, for later evaluation of mean Q-values
    if getattr(engine.state, "eval_states", None) is None:
        eval_states = buffer.sample(STATES_TO_EVALUATE)
        eval_states = [np.array(transition.state, copy=False) for transition in eval_states]
        engine.state.eval_states = np.array(eval_states, copy=False)
    writer.add_scalar("training/loss", loss_v.item(), engine.state.epoch)
    return {"loss": loss_v.item(), "epsilon": selector.epsilon}
# Metrics collected during validation
METRICS = ('episode_reward', 'episode_steps', 'order_profits', 'order_steps')

# Validation: run the (mostly greedy) policy on a separate environment and collect trading statistics
def validation_run(env, net, episodes=100, device="cpu", epsilon=0.02, commission=0.1):
    stats = {metric: [] for metric in METRICS}

    for episode in range(episodes):
        obs = env.reset()
        total_reward = 0.0
        position = None
        position_steps = None
        episode_steps = 0

        while True:
            obs_v = torch.tensor([obs]).to(device)
            out_v = net(obs_v)
            action_idx = out_v.max(dim=1)[1].item()
            # A small epsilon keeps a little exploration even during validation
            if np.random.random() < epsilon:
                action_idx = env.action_space.sample()
            action = Actions(action_idx)
            close_price = env._state._cur_close()

            if action == Actions.Buy and position is None:
                position = close_price
                position_steps = 0
            elif action == Actions.Close and position is not None:
                profit = close_price - position - (close_price + position) * commission / 100
                profit = 100.0 * profit / position
                stats['order_profits'].append(profit)
                stats['order_steps'].append(position_steps)
                position = None
                position_steps = None

            obs, reward, done, _ = env.step(action_idx)
            total_reward += reward
            episode_steps += 1
            if position_steps is not None:
                position_steps += 1
            if done:
                # Close any open position at the end of the episode
                if position is not None:
                    profit = close_price - position - (close_price + position) * commission / 100
                    profit = 100.0 * profit / position
                    stats['order_profits'].append(profit)
                    stats['order_steps'].append(position_steps)
                break

        stats['episode_reward'].append(total_reward)
        stats['episode_steps'].append(episode_steps)

    return {key: np.mean(vals) for key, vals in stats.items()}
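validation_run assumes the Actions enum imported from mymodule maps discrete action indices to trading decisions. Based on how the code uses Actions.Buy and Actions.Close, it presumably looks roughly like this (the exact members and values are an assumption):

import enum

class Actions(enum.Enum):
    # Indices must match StocksEnv's discrete action space
    Skip = 0
    Buy = 1
    Close = 2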

Set up the Ignite engine and handlers, then start training.

# Create the Ignite engine that drives the training loop
trainer = Engine(train_batch)

@trainer.on(Events.COMPLETED | Events.EPOCH_COMPLETED(every=10))
def log_training_results(engine):
    if engine.state.epoch % 10 == 0:
        res = validation_run(env_val, net, episodes=100, device="cpu", epsilon=0.02, commission=0.1)
        for key, value in res.items():
            writer.add_scalar(f"Agent Metrics/{key}", value, engine.state.epoch)

@trainer.on(Events.ITERATION_COMPLETED)
def log_iteration_metrics(engine):
    out_dict = engine.state.output
    for key, value in out_dict.items():
        if value is None:
            value = 0.0
        elif isinstance(value, torch.Tensor):
            value = value.item()
        writer.add_scalar(f"Iteration Metrics{engine.state.epoch}/{key}", value, engine.state.iteration)
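Note that nothing in the handlers above synchronizes the target network with the online network, which the Double DQN loss depends on. A minimal sketch of such a handler; the sync interval TGT_NET_SYNC and the handler itself are additions not present in the original code:

TGT_NET_SYNC = 1000  # assumed sync interval; not defined in the original article

@trainer.on(Events.ITERATION_COMPLETED)
def sync_target_net(engine):
    # Periodically copy the online network's weights into the target network
    if engine.state.iteration % TGT_NET_SYNC == 0:
        tgt_net.sync()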
# Checkpointing: keep the two most recent model checkpoints
checkpoint_handler = ModelCheckpoint(dirname='saved_models', filename_prefix='checkpoint', n_saved=2, require_empty=False)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'model': net})

# Run training, save the final weights, and evaluate on the validation environment
trainer.run(batch_generator(buffer, REPLAY_INITIAL, BATCH_SIZE), max_epochs=100)
writer.close()
torch.save(net.state_dict(), 'model_state_dict.pth')
res = validation_run(env_val, net, episodes=100, device="cpu", epsilon=0.02, commission=0.1)
print(res)

We then do a quick run to train the model and check the logs.



Training the model on a GPU with 8 GB of memory (specifically, a T2-medium instance) took 12 hours to complete. The agent needs roughly another 30,000 episodes to converge and start showing satisfactory performance. As for whether this kind of reinforcement learning can really make money? Of course, 小海豹 hopes all of you get rich!
