Trading with Dueling DQN: Profit or Loss? (Part 2)
Continuing our journey through the intersection of finance and artificial intelligence, today we dig into applying reinforcement learning (RL) to stock trading. In the previous installment, "Trading with Dueling DQN: Profit or Loss? (Part 1)", we covered the fundamentals of RL and its use in financial markets; now we deepen that understanding by implementing a Dueling Double Deep Q-Network (Dueling DQN).
First, import the necessary libraries.
import numpy as np
import torch
import torch.nn as nn
from typing import Iterable, Union
from datetime import datetime, timedelta
import ptan
import pathlib
import argparse
import gym.wrappers
import torch.optim as optim
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import Accuracy, Loss
from tensorboardX import SummaryWriter
from mymodule import SimpleFFDQN, EpsilonTracker, Actions, preprocess, StocksEnv
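The mymodule import pulls in helpers from this series that are not reproduced here: the network (SimpleFFDQN), the epsilon schedule (EpsilonTracker), the action enum (Actions), the data loader (preprocess), and the trading environment (StocksEnv). For orientation only, a dueling-style feed-forward Q-network of the kind SimpleFFDQN refers to might be sketched as follows; the class name and layer sizes are illustrative assumptions, not the exact implementation used in this series.
class DuelingFFDQNSketch(nn.Module):
    """Dueling architecture: a value stream V(s) and an advantage stream A(s, a)."""
    def __init__(self, obs_len: int, actions_n: int):
        super().__init__()
        self.fc_val = nn.Sequential(
            nn.Linear(obs_len, 512), nn.ReLU(),
            nn.Linear(512, 1),                # scalar state value V(s)
        )
        self.fc_adv = nn.Sequential(
            nn.Linear(obs_len, 512), nn.ReLU(),
            nn.Linear(512, actions_n),        # one advantage per action A(s, a)
        )
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        val = self.fc_val(x)
        adv = self.fc_adv(x)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a); subtracting the mean keeps V and A identifiable
        return val + (adv - adv.mean(dim=1, keepdim=True))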
The Bellman equation is one of the central equations of dynamic programming. For a Markov decision process (MDP), it expresses the value of the current state in terms of the values of the states that may follow it, as the long-run discounted reward under an optimal policy. In reinforcement learning it is used to estimate the state-value function V or the action-value function Q, which in turn guides the agent toward an optimal policy; it is the backbone of Q-learning and of deep Q-networks (DQN).
$V(s) = \max_{a}\Big( R(s,a) + \gamma \sum_{s'} P(s' \mid s, a)\, V(s') \Big)$
In the formula above, V(s) is the value function and R(s,a) is the reward obtained by taking action a in state s (both were discussed in Part 1). P(s' | s, a) is the probability of moving to state s' after taking action a in state s, and γ is the discount factor that discounts future rewards back to their present value.
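As a concrete, purely illustrative calculation of the target this equation produces for a single transition (all numbers made up):
r, gamma = 1.0, 0.99                    # hypothetical reward and the discount factor used below
q_next = np.array([0.5, 2.0, -0.3])     # hypothetical Q-values of the next state, one per action
target = r + gamma * q_next.max()       # Bellman target: 1.0 + 0.99 * 2.0 = 2.98
print(target)
The calc_loss function below computes exactly this kind of target for a whole batch, in the Double DQN variant where the online network chooses the next action and the target network evaluates it.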
def calc_loss(batch, net, tgt_net, gamma, device="cpu"):
    states, actions, rewards, dones, next_states = unpack_batch(batch)
    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    if dones is not None:
        done_mask = torch.BoolTensor(dones.astype(bool)).to(device)
    else:
        done_mask = torch.BoolTensor(np.array([0], dtype=np.uint8))
    # Q-values predicted by the online network for the actions actually taken
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    # Double DQN: the online network picks the best action in the next state...
    next_state_actions = net(next_states_v).max(1)[1]
    # ...and the target network evaluates that action
    next_state_values = tgt_net.target_model(next_states_v).gather(1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    # terminal states contribute no future reward
    next_state_values[done_mask] = 0.0
    # The Bellman equation gives the expected Q-values for the current state-action pairs
    expected_state_action_values = next_state_values.detach() * gamma + rewards_v
    # MSE between predicted and Bellman-target Q-values
    return nn.MSELoss()(state_action_values, expected_state_action_values)
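calc_loss relies on an unpack_batch helper that is not shown in this post. A minimal sketch of what such a helper might look like, assuming PTAN's ExperienceFirstLast transitions (where last_state is None for terminal transitions); this is an assumption about the helper, not its exact implementation:
def unpack_batch(batch):
    states, actions, rewards, dones, last_states = [], [], [], [], []
    for exp in batch:
        states.append(np.array(exp.state, copy=False))
        actions.append(int(exp.action))
        rewards.append(exp.reward)
        dones.append(exp.last_state is None)
        # for terminal transitions reuse the current state as a placeholder;
        # its value is zeroed out by the done mask in calc_loss anyway
        last_states.append(np.array(exp.state if exp.last_state is None else exp.last_state, copy=False))
    return (np.array(states, copy=False), np.array(actions), np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8), np.array(last_states, copy=False))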
BATCH_SIZE = 32
BARS_COUNT = 10
EPS_START = 1.0
EPS_FINAL = 0.1
EPS_STEPS = 1000000
GAMMA = 0.99
REPLAY_SIZE = 100000
REPLAY_INITIAL = 10000
REWARD_STEPS = 2
LEARNING_RATE = 0.0001
STATES_TO_EVALUATE = 1000
# These hyperparameters (batch size, exploration schedule, discount factor, replay-buffer sizes, and so on) strongly shape the model's learning dynamics
writer = SummaryWriter(log_dir='logs')
train_path = "ch08-small-quotes/YNDX_150101_151231.csv"
val_path = "ch08-small-quotes/YNDX_150101_151231.csv"
The heart of our experiment is the training loop, where we orchestrate the process of learning an optimal trading policy. By defining a batch generator and a per-batch training function, we optimize the neural network with the Bellman-equation loss, the key ingredient that updates Q-values from the transitions observed in the environment.
tp = preprocess(train_path)
vp = preprocess(val_path)
# Creating environments
env = StocksEnv(tp, bars_count=10, commission=0.1, reset_on_close=True, state_1d=False, random_ofs_on_reset=True, reward_on_close=False, volumes=True)
env_val = StocksEnv(vp, bars_count=10, commission=0.1, reset_on_close=True, state_1d=False, random_ofs_on_reset=True, reward_on_close=False, volumes=True)
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initializing neural network models
net = SimpleFFDQN(env.observation_space.shape[0], env.action_space.n).to(device)
tgt_net = ptan.agent.TargetNet(net)
# Initializing action selector and epsilon tracker
selector = ptan.actions.EpsilonGreedyActionSelector(EPS_START)
eps_tracker = EpsilonTracker(selector, EPS_START, EPS_FINAL, EPS_STEPS)
# Initializing DQN agent
agent = ptan.agent.DQNAgent(net, selector, device=device)
# Initializing experience source
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, GAMMA, steps_count=REWARD_STEPS)
# Initializing optimizer
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
# Initializing experience replay buffer (the training Engine is created after train_batch is defined below)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
Next we set up the batch generator fed by exp_source and define the training and validation functions.
# Define batch generator function
def batch_generator(buffer: ptan.experience.ExperienceReplayBuffer, initial: int, batch_size: int):
    buffer.populate(initial)            # fill the buffer before training starts
    while True:
        buffer.populate(1)              # add one fresh transition per training step
        yield buffer.sample(batch_size)
# Define training batch function
def train_batch(engine, batch):
    optimizer.zero_grad()
    loss_v = calc_loss(batch=batch, net=net, tgt_net=tgt_net, gamma=GAMMA ** REWARD_STEPS, device=device)
    loss_v.backward()
    optimizer.step()
    eps_tracker.frame(engine.state.iteration)
    # keep a fixed set of states around for later evaluation of mean Q-values
    if getattr(engine.state, "eval_states", None) is None:
        eval_states = buffer.sample(STATES_TO_EVALUATE)
        eval_states = [np.array(transition.state, copy=False) for transition in eval_states]
        engine.state.eval_states = np.array(eval_states, copy=False)
    writer.add_scalar("training/loss", loss_v.item(), engine.state.iteration)
    return {"loss": loss_v.item(), "epsilon": selector.epsilon}
# Initializing Ignite Engine for training (train_batch must be defined before it is wrapped)
trainer = Engine(train_batch)
# Validation function
METRICS = ('episode_reward', 'episode_steps', 'order_profits', 'order_steps')

def validation_run(env, net, episodes=100, device="cpu", epsilon=0.02, commission=0.1):
    stats = {metric: [] for metric in METRICS}
    for episode in range(episodes):
        obs = env.reset()
        total_reward = 0.0
        position = None
        position_steps = None
        episode_steps = 0
        while True:
            obs_v = torch.tensor([obs]).to(device)
            out_v = net(obs_v)
            action_idx = out_v.max(dim=1)[1].item()
            # a small amount of epsilon-greedy exploration during validation
            if np.random.random() < epsilon:
                action_idx = env.action_space.sample()
            action = Actions(action_idx)
            close_price = env._state._cur_close()
            if action == Actions.Buy and position is None:
                position = close_price
                position_steps = 0
            elif action == Actions.Close and position is not None:
                # round-trip profit in percent, net of commission on both legs
                profit = close_price - position - (close_price + position) * commission / 100
                profit = 100.0 * profit / position
                stats['order_profits'].append(profit)
                stats['order_steps'].append(position_steps)
                position = None
                position_steps = None
            obs, reward, done, _ = env.step(action_idx)
            total_reward += reward
            episode_steps += 1
            if position_steps is not None:
                position_steps += 1
            if done:
                if position is not None:
                    # force-close any open position at the end of the episode
                    profit = close_price - position - (close_price + position) * commission / 100
                    profit = 100.0 * profit / position
                    stats['order_profits'].append(profit)
                    stats['order_steps'].append(position_steps)
                break
        stats['episode_reward'].append(total_reward)
        stats['episode_steps'].append(episode_steps)
    return {key: np.mean(vals) for key, vals in stats.items()}
Set up the Ignite handlers and launch training.
@trainer.on(Events.COMPLETED | Events.EPOCH_COMPLETED(every=10))
def log_training_results(engine):
    # the every=10 filter already restricts this to every 10th epoch (plus the final COMPLETED event)
    res = validation_run(env_val, net, episodes=100, device=device, epsilon=0.02, commission=0.1)
    for key, value in res.items():
        writer.add_scalar(f"Agent Metrics/{key}", value, engine.state.epoch)
@trainer.on(Events.ITERATION_COMPLETED)
def log_something(engine):
    out_dict = engine.state.output
    for key, value in out_dict.items():
        if value is None:
            value = 0.0
        elif isinstance(value, torch.Tensor):
            value = value.item()
        writer.add_scalar(f"Iteration Metrics{engine.state.epoch}/{key}", value, engine.state.iteration)
checkpoint_handler = ModelCheckpoint(dirname='saved_models', filename_prefix='checkpoint', n_saved=2, require_empty=False)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'model': net})
# batch_generator is an infinite iterator, so give Ignite an explicit epoch_length
# (10,000 steps per epoch is an arbitrary choice) so that the EPOCH_COMPLETED handlers actually fire
trainer.run(batch_generator(buffer, REPLAY_INITIAL, BATCH_SIZE), max_epochs=100, epoch_length=10000)
writer.close()
torch.save(net.state_dict(), 'model_state_dict.pth')
res = validation_run(env_val, net, episodes=100, device=device, epsilon=0.02, commission=0.1)
print(res)
With that, we run the model end to end, train it, and check the logs.
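After training, the saved weights can be reloaded for a later, purely greedy evaluation pass. A minimal sketch, assuming the same SimpleFFDQN architecture and the environments defined above:
# Rebuild a network of the same shape and load the trained weights
eval_net = SimpleFFDQN(env.observation_space.shape[0], env.action_space.n)
eval_net.load_state_dict(torch.load('model_state_dict.pth', map_location=device))
eval_net.to(device)
eval_net.eval()
# epsilon=0.0 makes the evaluation fully greedy
print(validation_run(env_val, eval_net, episodes=100, device=device, epsilon=0.0, commission=0.1))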
Training the model on a GPU with 8 GB of memory (specifically, a T2-medium instance) took 12 hours to complete, and the agent needed roughly 30,000 additional episodes to converge and start showing satisfactory performance. As for whether this kind of reinforcement learning can really make money? 小海豹, of course, hopes it helps you all prosper!
———END———