TradeMaster-NTU · Gikiman · Feb 19, 2023 · Feb 19, 2023 · Feb 20, 2023 · Feb 20, 2023
diff --git a/agent/SARL/encoder/util.py b/agent/SARL/encoder/util.py
@@ -10,30 +10,20 @@ def prepart_m_lstm_data(df, num_day, technical_indicator):
     tic_list = df.tic.unique()
     df_list = []
     label_list = []
-    for index in df.index.unique()[num_day:]:
-        dfs = []
+    for tic in tqdm(tic_list):
+
         labels = []
-        df_date = df[[
-            True if i in range(index - num_day, index) else False
-            for i in df.index
-        ]]
-        for tic in tic_list:
-            df_tic = df_date[df_date.tic == tic]
-            np_tic = df_tic[technical_indicator].to_numpy()
-            # print(np_tic.shape)
-            dfs.append(np_tic)
-            old_price = float(df_tic[df_tic.index == index - 1].close)
-            new_price = float(df[(df.index == index) * (df.tic == tic)].close)
-            if new_price > old_price:
-                label = 1
-            else:
-                label = 0
-            labels.append(label)
+        df_tic = df[df.tic == tic]
+        dfs = df_tic[technical_indicator].to_numpy()
+        old_prices = df_tic[num_day-1:len(df_tic)-1]['close'].astype(float).values
+        new_prices = df_tic[num_day:len(df_tic)]['close'].astype(float).values
+
+        labels = (new_prices > old_prices).astype(int)
+        label_list.append(np.expand_dims(labels, axis=1))
         df_list.append(dfs)
-        label_list.append(labels)
-    label_list = np.array(label_list)
-    df_list = np.array(df_list)
-    return label_list, df_list
+    label_list = np.concatenate(label_list, axis=1) 
+    df_list = np.array(df_list) 
+    return label_list, df_list 
 
 
 def prepart_lstm_data(df, num_day, technical_indicator):
@@ -119,18 +109,18 @@ def dict_to_args(**kwargs):
     args = parser.parse_args()
     return args
 
-
 class m_lstm_dataset(Dataset):
-    def __init__(self, df_list, label_list):
+    def __init__(self, df_list, label_list,num_day):
         self.df = df_list
         self.label = label_list
+        self.num_day = num_day
         self.X = torch.from_numpy(self.df).float()
         self.y = torch.from_numpy(self.label).float()
 
     def __len__(self):
-        return self.df.shape[0]
+        return self.label.shape[0]
 
     def __getitem__(self, idx):
-        X = self.X[idx, :, :, :]
+        X = self.X[:, idx:idx+self.num_day, :]
         y = self.y[idx, :]
         return X, y
diff --git a/docs/source/introduction.md b/docs/source/introduction.md
@@ -16,9 +16,8 @@ Architecture of Trademaster framework could be visualizaed by the figure below.
 
   TradeMaster is evaluated in multiple dimenstions. Financial metrics like profit and risk metrics are applied. Additionally, decision tree and shapley value are used to evaluate the explainability of the model. Variability and Alpha decay are used for reliability evaluation.
 
-<div align="center">
-<img align="center" src=../../figure/Architecture.jpg width="70%"/>
-</div>
+![Architecture.jpg](../../figure/Architecture.jpg)
+
 
 ## Supported Trading Scenario
 

diff --git a/docs/source/script/yahoo.md b/docs/source/script/yahoo.md
@@ -1,3 +1,14 @@
 # Download Data from Yahoo Finance
+To build your own dataset, you can use Yahoo Finance, a platform that gives you access to various types of financial market data, such as US stocks, forex, and cryptocurrencies, via the open-source yfinance Python package.
 
+Here is an example script that downloads Apple's stock data with yfinance. The downloaded data contains the open, high, low, close, and adjusted close prices as well as the volume.
+
+   ```
+   import yfinance as yf
+   start_date='2009-01-02'
+   end_date='2021-01-01'
+   df = yf.download('AAPL', start=start_date, end=end_date, interval='1d')
+   ```
+By modifying the arguments of `yf.download` (ticker, date range, and interval), you can customize the downloaded dataset.
+
 
diff --git a/docs/source/tool/csdi.md b/docs/source/tool/csdi.md
@@ -1,3 +1,9 @@
 # Missing Value Imputation with CSDI
+Most raw data retrieved from different data sources contains missing values (NaN values), and the most common way of dealing with them is to drop them directly. However, we provide an alternative solution based on the imputation model proposed in the following paper.
 
+[CSDI: Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation (Yusuke Tashiro et al.)](https://arxiv.org/abs/2107.03502) *NeurIPS 2021*
+
+CSDI is a diffusion model that generates the missing values in raw data through a diffusion process, using the observed values as conditional input. The model is trained by optimizing an unsupervised task: recovering a certain ratio of masked observed data, using the remaining observed data as conditional input. When performing actual imputation on a dataset, all missing values are imputation targets and all observed values serve as conditional input. Please refer to the original paper if you have any questions about the methodology.
+
+We have packaged the model as a ready-to-use toolbox for missing-value imputation of financial data. Please refer to [CSDI for financial data imputation](https://github.com/ZONG0004/TradeMaster/blob/main/data/CSDI/README.md) for detailed usage guidelines and visualization results.
 
diff --git a/docs/source/tool/example_figs/FInal_compass.png b/docs/source/tool/example_figs/FInal_compass.png