Understanding the theoretical meaning of ‘state_pos’ in a trading algorithm.
😎 Here’s the context code you are going to work on. Acknowledge reception:
from collections import deque
import pickle
import sys
from typing import Any, List, Sequence, Tuple
from collections import deque
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from best_execution.src.agents.dqn import DDQNAgent
from best_execution.src.report.report import plot_results
import numpy as np
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from collections import deque
class DDQNAgent():
def __init__(
self, env, gamma=0.99, epsilon=0.9, alpha=0.01,
batch_size=64, buffer_size=50000,
hidden_neurons=32, trainable=True
):
self.env = env
self.na = env.n_actions
self.n_features = env.n_features
self.epsilon = epsilon
self.gamma = gamma
self.minus_factor = None
self.div_factor = None
self.batch_size = batch_size
self.buffer_size = buffer_size
self.buffer = deque(maxlen=self.buffer_size)
self.hidden_neurons = hidden_neurons
self.primary_model = self.network()
self.primary_model.compile(
            loss="mse",
optimizer=Adam(learning_rate=alpha)
)
# Double Deep Q-Learning -> Add an extra model
self.target_model = self.network()
self.target_model.compile(
            loss="mse",
optimizer=Adam(learning_rate=alpha)
)
self._trainable = trainable
def network(self):
inp = Input(shape=(self.n_features, ))
        capa = Dense(int(self.hidden_neurons), activation="relu")(inp)
        capa = Dense(int(self.hidden_neurons), activation="relu")(capa)
        out = Dense(self.na, activation="linear")(capa)
return Model(inp, out)
def experience(self, s, a, r, s1, done=False):
if done:
s1 = None
self.buffer.append((s, a, r, s1))
def sample(self):
        # CER (Combined Experience Replay): random mini-batch that always includes the most recent transition
idx = np.hstack([np.random.choice(
(len(self.buffer)-1),
            size=self.batch_size - 1,
replace=False), (len(self.buffer)-1)])
return [self.buffer[i] for i in idx]
@staticmethod
def getQs(s, model):
return model(s).numpy()
def act(self, s):
if self.env.state_pos == 0:
return 1
if self._trainable:
if np.random.rand() > self.epsilon:
a = self._act_no_explore(s)
else:
a = self.env.action_sample()
else:
a = self._act_no_explore(s)
return a
def set_trainable(self, train=False):
self._trainable = train
def _act_no_explore(self, s):
q = self.getQs(
s=s.reshape(1, -1),
model=self.primary_model
)
return np.argmax(q)
def getQ(self, s, a):
return self.getQs(s)[:, a]
def switchmodels(self):
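        # Swap the primary and target networks; learn() calls this after every training batch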
self.primary_model, self.target_model = (
self.target_model, self.primary_model
)
def learn(self):
batch = self.sample()
s_list = np.array([experience[0].tolist() for experience in batch])
s1_list = np.array([
experience[3].tolist() for experience in batch
if experience[3] is not None
])
qs = self.getQs(s_list, self.primary_model)
qs1 = self.getQs(s1_list, self.primary_model)
q_target = self.getQs(s1_list, self.target_model)
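        # targets starts as the primary network's Q-values; only the entry for the action actually taken is overwritten below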
targets = qs
k = 0
for i, (s, a, r, s1) in enumerate(batch):
if s1 is None:
targets[i][a] = r
else:
best_next_action = np.argmax(qs1[k])
targets[i][a] = r + self.gamma * q_target[k, best_next_action]
k += 1
loss = self.primary_model.train_on_batch(s_list, targets)
self.switchmodels()
return loss
with open("data/rep_data.pkl", "rb") as f:
    dict_ = pickle.load(f)
rep_data, rep_data_val, rep_data_test = dict_["train"], dict_["val"], dict_["test"]
with open("data/san_data.pkl", "rb") as f:
    dict_ = pickle.load(f)
san_data, san_data_val, san_data_test = dict_["train"], dict_["val"], dict_["test"]
for label, data in [("REP", (rep_data, rep_data_val, rep_data_test)),
                    ("SAN", (san_data, san_data_val, san_data_test))]:
    print(f"{label} - 'train' data ({len(data[0].keys())})\n{data[0].keys()}")
    print(f"{label} - 'val' data ({len(data[1].keys())})\n{data[1].keys()}")
    print(f"{label} - 'test' data ({len(data[2].keys())})\n{data[2].keys()}\n")
class BestExecutionEnv:
def __init__(self, data, look_back=60):
# Fixed params
self.data = data
self.look_back = look_back
self.episode_bins = None
self.episode_full_len = None
self.vol_care = None
self.actions_fn = {
0: self._do_nothing,
1: self._agg_action,
}
self.n_actions = len(self.actions_fn)
self.n_features = self._detect_num_feat()
# Data variables
self.episode = None
self.episode_full = None
# Env variables
self.episode_vwap = None
self.market_ep_vol = None
        self.state_pos = 0  # current time step (bin) within the episode
self.exec_vol = 0
self.actions_hist = []
self.algo_vwap_hist = []
self.market_vwap_hist = []
self.reward_hist = []
self.price_hist = []
self.vol_hist = []
def _detect_num_feat(self):
self._reset_env_episode_params()
self._generate_episode()
s = self.observation_builder()
return s.shape[0]
def _reset_env_episode_params(self):
self.episode_full_len = None
self.episode = None
self.episode_full = None
self.episode_vwap = None
self.market_ep_vol = None
self.state_pos = 0
self.algo_vwap = 0
self.exec_vol = 0
self.actions_hist = []
self.algo_vwap_hist = []
self.market_vwap_hist = []
self.reward_hist = []
self.price_hist = []
self.vol_hist = []
def _generate_episode_params(self):
self.episode_bins = np.random.randint(low=400, high=600)
pct_bins = np.random.uniform(low=0.075, high=0.125)
self.vol_care = int(pct_bins * self.episode_bins)
self.episode_full_len = self.episode_bins + self.look_back
assert self.episode_bins <= 600
assert self.episode_bins >= 400
assert self.vol_care <= int(self.episode_bins * 0.125)
assert self.vol_care >= int(self.episode_bins * 0.075)
assert isinstance(self.vol_care, int)
def _generate_episode(self):
self._generate_episode_params()
        length_episode = 0
        while length_episode != self.episode_full_len:
selected_day = np.random.choice(
list(self.data.keys()
)
)
data_day = self.data[selected_day]
init_time = np.random.choice(data_day.index)
hour_pos = data_day.index.get_loc(init_time)
            initial_position = hour_pos - self.look_back
final_position = hour_pos + self.episode_bins
if initial_position < 0:
continue
else:
self.episode_full = data_day.iloc[initial_position:final_position, :]
self.episode = data_day.iloc[hour_pos:final_position, :]
                length_episode = self.episode_full.shape[0]
def reset(self) -> np.array:
self._reset_env_episode_params()
self._generate_episode()
self._compute_episode_market_feat()
return self.observation_builder()
def observation_builder(self) -> np.array:
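        # Observation = [fraction of episode time remaining, fraction of the order volume still to execute]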
        time_left = (self.episode_bins - self.state_pos) / self.episode_bins
        vol_left = 1 - (self.exec_vol / self.vol_care)
obs = np.array([time_left, vol_left])
return obs
def _compute_episode_market_feat(self) -> Tuple[float, float]:
        mid = (self.episode["ask1"] + self.episode["bid1"]) / 2
self.market_ep_vol = self.episode.cumvol.diff()
self.market_ep_vol[0] = 0
cum_vol = self.market_ep_vol.sum()
self.episode_vwap = (mid[:-1] * self.market_ep_vol[1:]).sum() / cum_vol
return self.episode_vwap, self.market_ep_vol
def _compute_algo_vwap(self) -> float:
p_arr = np.array(self.price_hist)
v_arr = np.array(self.vol_hist)
algo_vwap = np.sum(p_arr * v_arr) / np.sum(v_arr)
return algo_vwap
def _compute_reward(self, price: float, vol: float) -> float:
if vol == 0:
reward = 0
return reward
        reward = (self.episode_vwap - price) / self.episode_vwap  # positive when the fill price is below the episode VWAP benchmark
return reward
def _compute_stop_conditions(self) -> Tuple[bool, bool]:
        """Defines the episode stop conditions.
        Return:
            Time exhausted, order completed
        """
        # TODO: Compute the stop variables and return them in the appropriate order
is_bins_complete = self.state_pos == self.episode_bins
is_ord_complete = self.exec_vol == self.vol_care
return is_bins_complete, is_ord_complete
def _compute_done_reward(self) -> float:
# TODO: Free style
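        # Penalty of -1 when the episode ends with the order not fully executed, 0 otherwise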
_, is_ord_complete = self._compute_stop_conditions()
rwd_factor = not is_ord_complete
done_reward = -1 * rwd_factor
return done_reward
def _agg_action(self) -> float:
        price = self.episode["ask1"].values[self.state_pos]
self.price_hist.append(price)
exec_vol = 1
self.exec_vol += exec_vol
self.vol_hist.append(exec_vol)
algo_vwap = self._compute_algo_vwap()
        # store algo_vwap in algo_vwap_hist
self.algo_vwap_hist.append(algo_vwap)
reward = self._compute_reward(price, exec_vol)
return reward
def _do_nothing(self) -> float:
price = 0
exec_vol = 0
self.price_hist.append(price)
self.vol_hist.append(exec_vol)
algo_vwap = self.algo_vwap_hist[-1]
self.algo_vwap_hist.append(algo_vwap)
reward = self._compute_reward(price, exec_vol)
return reward
def _compute_market_vwap(self) -> float:
        mid_p = (self.episode["ask1"] + self.episode["bid1"]) / 2
        mkt_p = (mid_p + mid_p.shift(-1).ffill()) / 2
        # Compute the market VWAP up to and including the current step
        v = self.episode["cumvol"].diff().shift(-1)
        p_arr = mkt_p.values[:self.state_pos + 1]
        v_arr = v.values[:self.state_pos + 1]
        sum_vol = np.sum(v_arr)
        # If the market volume up to this step is 0, return the last price up to this step
        if sum_vol == 0:
            return p_arr[-1]
        # Compute and return the cumulative VWAP up to this step
market_vwap = np.sum(p_arr * v_arr) / sum_vol
return market_vwap
def _compute_done(self) -> bool:
conditions = self._compute_stop_conditions()
is_bins_complete = conditions[0]
is_ord_complete = conditions[1]
        # TODO: Return done == True if either of the conditions is met
done = is_bins_complete or is_ord_complete
return done
def step(self, action) -> Tuple[np.array, float, bool, dict]:
        """ Evaluates the action, computes the reward, and returns the
        new state and whether the episode has finished.
        """
market_vwap = self._compute_market_vwap()
act_fn = self.actions_fn.get(action)
if act_fn is None:
raise ValueError(
                f"Invalid action {action}. Valid actions {self.actions_fn.keys()}"
)
reward = act_fn()
self.market_vwap_hist.append(market_vwap)
self.reward_hist.append(reward)
self.state_pos += 1
done = self._compute_done()
if done:
reward += self._compute_done_reward()
return None, reward, done, {}
observation = self.observation_builder()
return np.array(observation), reward, done, {}
def action_sample(self) -> int:
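        # Random exploration: execute with probability ~ target volume / episode length, so random play roughly paces the order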
p = self.vol_care / self.episode.shape[0]
action = np.random.choice([0, 1], p=[1-p, p])
return action
def stats_df(self):
        """Information for the execution results plot
        """
        my_df = pd.DataFrame(
            {"vwap": self.algo_vwap_hist, "vol": self.vol_hist},
            index=list(self.episode.index)[:len(self.algo_vwap_hist)]
        )
        my_df = my_df.reindex(self.episode.index)
        my_df["vol"] = my_df["vol"].fillna(0)
        my_df["vwap"] = my_df["vwap"].ffill()
        p = self.episode["ask1"]
        v = self.episode["cumvol"].diff().shift(-1)
        last_v = self.episode_full["cumvol"].diff().iloc[-1]
        v.iloc[-1] = last_v
        market_vwap = (p * v).cumsum() / v.cumsum()
        market_df = pd.DataFrame(
            {"vwap": market_vwap, "vol": v},
            index=v.index
        )
        mpx = (self.episode["ask1"] + self.episode["bid1"]) / 2
return my_df, market_df, mpx
class TWAP(DDQNAgent):
def act(self, s): # [T, V]
        # TODO: Set up a deterministic TWAP using s
if s[1] >= s[0]:
return 1
return 0
"""
    Agent Params
"""
epsilon = 1
min_epsilon = 0.05
gamma = 1
alpha = 0.0001
buffer_size = 40000
batch_size = 256
hidden_neurons = 240
"""
    Training Params
"""
nepisodes = 200
n_log = 25
epsilon_decay = (epsilon - min_epsilon) / (nepisodes * 0.95)
learn_after = batch_size
env = BestExecutionEnv(rep_data, 60)
agent = DDQNAgent(
env, gamma=gamma, epsilon=epsilon, alpha=alpha,
batch_size=batch_size, buffer_size=buffer_size,
hidden_neurons=hidden_neurons, trainable=True
)
# At this point eps is 1 -> acting randomly
s = env.reset()
for exps in range(buffer_size):
a = agent.act(s)
s1, r, done, _ = env.step(a)
agent.experience(s, a, r, s1, done)
s = s1
if not exps % 10000:
        print(f'buffer exps: {exps}')
if done:
s = env.reset()
🤖 I have received the context code and it’s ready for further processing or modifications as required.
😎 suggest some print traces that we can put before and after the last for loop to visualize what changes in the environment. Code only. DON’T REPEAT PREVIOUS CODE
🤖
```python
print("Initial Environment State:")
print(f"Initial Observation: {s}")
print(f"Initial Epsilon: {agent.epsilon}")

# Start of the for loop
print("Starting Exploration Loop...")
s = env.reset()
for exps in range(buffer_size):
    a = agent.act(s)
    s1, r, done, _ = env.step(a)
    agent.experience(s, a, r, s1, done)
    if not exps % 10000:
        print(f'buffer exps: {exps}')
        print(f'Action Taken: {a}')
        print(f'Reward Received: {r}')
        print(f'New Observation: {s1}')
        print(f'Is Done: {done}')
    s = s1
    if done:
        print("Episode Done. Resetting Environment...")
        s = env.reset()
print("Exploration Loop Finished")
```
😎 I said before and after the loop
🤖
```python
print("Before starting the exploration loop:")
print(f"Initial Observation: {s}")
print(f"Initial Epsilon: {agent.epsilon}")
print(f"Environment initial state position: {env.state_pos}")
print(f"Initial Volume Executed: {env.exec_vol}")

# Start of the for loop
s = env.reset()
for exps in range(buffer_size):
    a = agent.act(s)
    s1, r, done, _ = env.step(a)
    agent.experience(s, a, r, s1, done)
    s = s1

    if not exps % 10000:
        print(f'buffer exps: {exps}')

    if done:
        s = env.reset()

print("After finishing the exploration loop:")
print(f"Final Observation: {s}")
print(f"Final State Position: {env.state_pos}")
print(f"Final Volume Executed: {env.exec_vol}")
print(f"Buffer Size: {len(agent.buffer)}")
```
😎 why do you think that is useful?
🤖 I included print traces before and after the loop to provide a snapshot of the environment and agent’s states. The print statements serve specific purposes:
1. **Before the Loop:**
– **Initial Observation**: Shows the state of the environment (`s`) before any actions are taken. This provides insight into how the environment looks initially.
– **Initial Epsilon**: This helps monitor the initial exploration rate of the agent, which directly affects the balance between exploration and exploitation.
– **Environment Initial State Position**: Displays the starting position in the episode to check whether the environment was correctly initialized.
– **Initial Volume Executed**: Tracks the initial executed volume (`exec_vol`), which should ideally be zero to confirm a fresh start of each episode.
2. **After the Loop:**
– **Final Observation**: Displays the state after the exploration loop is complete. This can help verify whether the agent has reached the expected state, providing insight into the transition process.
– **Final State Position**: This helps understand how far into the episode the agent reached, which can provide insight into whether the agent is completing the environment or terminating earlier.
– **Final Volume Executed**: This indicates the total volume executed by the agent, useful to verify whether the agent has successfully completed the order.
– **Buffer Size**: Tracks how many experiences have been collected, useful to verify if the buffer is filling as expected during exploration.
These traces are meant to give a clear before-and-after view of the loop, providing insights into the changes in the agent and environment states, identifying whether the agent’s actions have effectively progressed through the environment, and ensuring that key variables change as intended. This kind of information can be critical for debugging and understanding how the environment evolves with the agent’s actions.
😎 What is the loop for?
🤖 The loop at the end of the code is for **filling the experience replay buffer** before training begins. Specifically, it aims to achieve the following:
1. **Random Exploration**: At the beginning (`epsilon = 1`), the agent is exploring the environment randomly. This ensures a diverse range of experiences is collected in the buffer without biasing towards any particular action.
2. **Experience Buffer Initialization**: The loop runs until `buffer_size` experiences are collected (`for exps in range(buffer_size)`). The replay buffer stores tuples of `(state, action, reward, next_state)` which are crucial for training the agent using experience replay. Filling the buffer initially with diverse experiences helps the agent learn better once training starts by providing varied data.
3. **Environment Step Execution**: During each iteration, the agent takes an action (`a = agent.act(s)`) based on the current state, and the environment returns the next state, reward, and whether the episode is done (`s1, r, done, _ = env.step(a)`). This process generates the experiences that will be stored in the buffer (`agent.experience(s, a, r, s1, done)`).
4. **Environment Reset**: If an episode ends (`done == True`), the environment is reset to start a new episode (`s = env.reset()`).
In essence, this loop allows the agent to interact with the environment and gather experiences that are critical for training. Without a filled experience replay buffer, the agent would not have enough data to learn effectively.
😎 open the code in canvas
🤖🛠️ JSON
{
  "name": "ddqn_agent_training",
  "type": "code/python",
  "content": "[canvas payload: a verbatim copy of the Python code shown above, with the before/after print traces added around the exploration loop]"
}
🤖 The code is now available in the canvas for further modifications or analysis. Let me know if specific adjustments or enhancements are needed.
😎 the epsilon at the end is also 1. Is that expected?
🤖 The epsilon value at the end is still `1` because there is no code within the exploration loop that updates or decays the value of `epsilon`. Typically, in Deep Q-Learning, `epsilon` is decayed after each episode or each step to gradually transition from exploration (random actions) to exploitation (more deterministic, learned actions).
To address this, an update step for `epsilon` should be added within the loop, something like:
```python
# Exploration Loop
s = env.reset()
for exps in range(buffer_size):
    a = agent.act(s)
    s1, r, done, _ = env.step(a)
    agent.experience(s, a, r, s1, done)
    s = s1

    # Update epsilon after every experience
    agent.epsilon = max(min_epsilon, agent.epsilon - epsilon_decay)

    if not exps % 10000:
        print(f'buffer exps: {exps}')
        print(f'Epsilon: {agent.epsilon}')

    if done:
        s = env.reset()
```
This modification will decay `epsilon` over time, ensuring that the agent gradually relies less on random actions and more on learned actions.
😎 where did you define min_epsilon and epsilon_decay?
🤖 The variables `min_epsilon` and `epsilon_decay` are defined in the “Agent Params” and “Training Params” sections respectively:
– **`min_epsilon`**: Defined as `0.05`. This represents the minimum exploration rate, ensuring the agent always has some level of exploration even during the late stages of training.
– **`epsilon_decay`**: Defined as `(epsilon - min_epsilon) / (nepisodes * 0.95)`. This determines the rate at which `epsilon` decreases over time, transitioning gradually from exploration to exploitation across episodes. With `epsilon = 1`, `min_epsilon = 0.05`, and `nepisodes = 200`, this works out to (1 - 0.05) / 190 = 0.005 per episode, so `epsilon` reaches its floor after roughly 95% of the training episodes.
However, in the exploration loop, there is no update being performed on `agent.epsilon` to reduce its value. To properly implement the epsilon decay, an update should be added in each iteration, for example:
```python
# Update epsilon after each iteration
agent.epsilon = max(min_epsilon, agent.epsilon - epsilon_decay)
```
This ensures that `epsilon` decreases as experiences are gathered, allowing the agent to explore less and exploit more over time. Would this modification fulfill the intended decay process, or are further adjustments needed?
😎 But you are not training the model yet. Are you sure you need to do that at this step where you are filling the buffer?
🤖 You are correct. During the buffer-filling phase, the focus is on populating the experience replay buffer with diverse experiences, which typically involves random exploration (`epsilon = 1`). The intention is not to decay `epsilon` during this phase, since the agent isn’t being trained yet. The `epsilon` decay should ideally start after the training loop begins, where the agent will transition from exploration to exploitation.
In this code segment, keeping `epsilon` at 1 is expected and appropriate because it ensures maximum exploration while filling the replay buffer, leading to a diverse set of experiences.
😎 Update the canvas with the rest of the code:
agent.set_trainable(True)
learn_counter = 0
history_steps = []
history_rewards = []
history_disc_rewards = []
history_losses = []
list_df = []
list_market_df = []
list_mpx = []
for episode in range(nepisodes):
s = env.reset()
step = 0
cum_reward = 0
dis_cum_reward = 0
episode_losses = []
while True:
a = agent.act(s)
s1, r, done, _ = env.step(a)
agent.experience(s, a, r, s1, done)
learn_counter += 1
cum_reward += r
dis_cum_reward += agent.gamma ** step * r
s = s1
step += 1
if not learn_counter % learn_after:
mse = agent.learn()
if done:
            agent.epsilon = max([agent.epsilon - epsilon_decay, min_epsilon])
history_rewards.append(cum_reward)
history_disc_rewards.append(dis_cum_reward)
history_losses.append(mse)
history_steps.append(step)
res = env.stats_df()
list_df.append(res[0])
list_market_df.append(res[1])
list_mpx.append(res[2])
if not episode % n_log:
mse = agent.learn()
print(
                    f'Episode: {episode}, '
                    f'steps: {np.round(np.mean(history_steps[-n_log:]), 2)}, '
                    f'rew: {np.round(np.mean(history_rewards[-n_log:]), 2)}, '
                    f'mse: {np.round(mse)}, '
                    f'eps: {np.round(agent.epsilon, 2)}'
)
break
agent.set_trainable(False)
cum_reward = 0
step = 0
env = BestExecutionEnv(dict_["test"], 60)
s = env.reset()
a = 1
s, r, done, _ = env.step(a)
step += 1
cum_reward += agent.gamma ** step * r
while True:
a = agent.act(s)
s, r, done, _ = env.step(a)
step += 1
cum_reward += agent.gamma ** step * r
if done:
break
plot_results(env)
pd.DataFrame(history_rewards).rolling(20).mean().plot()
🤖🛠️ JSON
{
  "updates": [
    {
      "pattern": ".*",
      "replacement": "[canvas payload: a verbatim copy of the code above, extended with the training loop and test-evaluation code provided by the user]"
    }
  ]
}
🤖 The additional code has been added to the canvas, extending the training and evaluation process of the DDQN agent. Let me know if further modifications are required or if there’s anything else to work on.
😎 explain in few words what is the theoretical meaning of the variable state_pos
🤖 The variable `state_pos` represents the current position or time step within the episode. It tracks how far the agent has progressed through the trading episode, effectively acting as a counter for the number of actions taken. This value is used to determine when an episode is complete and is incremented after each action is performed by the agent.
😎 This is a trading execution algorithm. You just described machine code. The request is to describe the theoretical meaning. Is it money, is it time…? WHATEVER IT IS, READ THE CODE, UNDERSTAND IT, AND SAY WHAT IT IS
[Message flagged as bad_response]
🤖 In this trading execution algorithm, the variable `state_pos` represents the current time step or the progress within the trading episode. Theoretically, it tracks the passage of time during the execution of a trading order, essentially indicating how much time has elapsed since the start of the episode in terms of discrete trading periods or decision points. It helps determine how much time remains for executing the order (`time_left`) and thus influences the agent’s decision-making during the episode.
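As a minimal sketch (not part of the original notebook), the two places where `state_pos` matters both read it as an elapsed-time counter: the `time_left` feature built in `observation_builder` and the time-exhausted stop condition in `_compute_stop_conditions`. The numbers below are illustrative only:
```python
# Illustrative values; in the environment these come from _generate_episode_params()
episode_bins = 500   # number of decision points (time bins) in the execution window
state_pos = 125      # bins elapsed since the start of the window

# Feature fed to the agent (see observation_builder)
time_left = (episode_bins - state_pos) / episode_bins   # 0.75 -> 75% of the window remains

# Stop condition (see _compute_stop_conditions)
is_bins_complete = state_pos == episode_bins            # False -> time not yet exhausted

print(time_left, is_bins_complete)
```
In other words, incrementing `state_pos` in `step()` is the clock of the execution window advancing by one bin.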
Hugging Face Dataset Metrics
All the conversation prompts, responses, and metrics are available to download and explore on Hugging Face dataset reddgr/talking-to-chatbots-chats: