
Commit

Add files via upload
RespectableGlioma authored Mar 30, 2021
0 parents commit b36f47e
Showing 15 changed files with 1,580 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -0,0 +1,3 @@
# DeepMatch - Simulation

Repository containing code for performing DeepMatch experiments on synthetic data.
81 changes: 81 additions & 0 deletions data/data_loader.py
@@ -0,0 +1,81 @@
import numpy as np
import h5py
import tables
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

ALLOWED_STATES = ['TRAIN', 'TEST', 'VAL']

class SimulationDataset(torch.utils.data.Dataset):
def __init__(self, data_fn, state, **kwargs):
""" Manages loading of NIS database samples in a H5 file, particularly for training.
"""
# super(NISDatabase, self).__init__()
self.filename = data_fn
self.dataset = np.load(data_fn)
self.dataset_key = kwargs.get('dataset_key', 'dataset')

self.state_inds = kwargs.get('state_inds', {})
self.allowed_states = kwargs.get('allowed_states', [])
self.pin_memory = kwargs.get('pin_memory', True)
self.change_state(state)

self.batch_size = kwargs.get('batch_size', 1000)
self.iterator = DataLoader(self, batch_size=self.batch_size, pin_memory=self.pin_memory, num_workers=0)

def __getitem__(self, index):
""" Load input data.
Note that index really indexes into inds for training, test, val split.
Also, loading data in worker processes and opening the HDF5 file *once*
in __get_item__ ensures multithreading from the DataLoader class
works appropriately. See
https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/16
for full discussion.
@TODO consider using torch.nn.data.Sampler instead of code below.
"""

state_index = self.inds[index]
return self.dataset[state_index]

def __len__(self):
""" Return length of dataset we're operating on.
"""
return self.dataset_len

def set_inds(self, inds):
self.dataset_len = inds.shape[0]
self.inds = inds

def set_dataset_key(self, key):
# Only meaningful for keyed backing stores (e.g. HDF5); invalidates the cached array so it must be reloaded.
self.dataset_key = key
self.dataset = None

def set_batch_size(self, batch_size):
self.batch_size = batch_size
self.iterator = DataLoader(self, batch_size=batch_size, pin_memory=self.pin_memory, num_workers=0)

def change_state(self, state):
""" Change attributes based on state of model.
"""

if not self.state_inds:
# If no split indices were given, fall back to a single 'full' split over the whole dataset.
self.state_inds = {
'full' : np.arange(self.dataset.shape[0])
}
self.allowed_states = ['full', ]

else:
# If split indices were provided as a dict, their keys define the allowed states.
if isinstance(self.state_inds, dict):
self.allowed_states = list(self.state_inds.keys())

if state not in self.allowed_states:
raise ValueError(f'{state} not found in allowed states {self.allowed_states}.')

self.set_inds(self.state_inds[state])
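A minimal usage sketch for SimulationDataset (not part of the commit): the array shape, split boundaries, and batch size below are illustrative assumptions; any (N, D) array saved as .npy works.

import numpy as np
import torch
from data.data_loader import SimulationDataset

# Hypothetical 100,000-sample array saved as .npy.
splits = {
    'TRAIN': np.arange(0, 80000),
    'VAL':   np.arange(80000, 90000),
    'TEST':  np.arange(90000, 100000),
}
ds = SimulationDataset('data/simulation_data_X.npy', 'TRAIN',
                       state_inds=splits, batch_size=512,
                       pin_memory=torch.cuda.is_available())

for batch in ds.iterator:
    pass  # each batch is a tensor of shape (512, n_features)

ds.change_state('VAL')  # switch the same object to the validation indices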
61 changes: 61 additions & 0 deletions data/datagen.py
@@ -0,0 +1,61 @@
import numpy as np
import pandas as pd

from sklearn.datasets import make_regression


def generate_dataset_linear(n_samples=500, set_X=None):
"""
Generate samples from two correlated distributions and allow for setting a specific
value for X to simulate an RCT where patients are assigned to experimental vs control groups.
X is a Bernoulli variable sampled from a binomial distribution.
Z is a confounder sampled from a uniform distribution.
Y is a continuous real number.
The probabilistic model for this is:
X --> Y
Z --> X
Z --> Y
Args:
n_samples (int): no. of samples to generate
set_X (arr): numpy array of treatment assignments used to set X to a specific intervention, simulating an RCT
Returns:
samples (pandas.DataFrame): a pandas dataframe of sampled data in the form [x, y, z]
"""

z = np.random.uniform(size=n_samples)

if set_X is not None:
assert(len(set_X) == n_samples)
x = set_X
else:
p_x = np.minimum(np.maximum(z,0.1), 0.9)
x = np.random.binomial(n=1, p=p_x, size=n_samples)

y0 = 2 * z
y1 = y0 - 0.5

y = np.where(x == 0, y0, y1) + 0.3 * np.random.normal(size=n_samples)

return pd.DataFrame({"x":x, "y":y, "z":z})


def generate_dataset_regression(n_features, rank, noise=0):
"""
Generate a dataset of 100,000 samples with n_features features and approximate rank 'rank'.
Args:
n_features (int): no. of features to generate
rank (int): approximate effective rank of the feature matrix (passed to make_regression as effective_rank)
noise (float): standard deviation of the Gaussian noise added to the output
Returns:
dataset (tuple of numpy.ndarray): the feature matrix X and target vector y returned by sklearn's make_regression.
"""

return make_regression(n_samples=100000, n_features=n_features, effective_rank=rank, noise=noise)
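A quick sketch of how these generators might be called (illustrative, not part of the commit); note that generate_dataset_regression returns the (X, y) tuple produced by sklearn's make_regression rather than a DataFrame, and the sample sizes and parameters below are assumptions.

import numpy as np
from data.datagen import generate_dataset_linear, generate_dataset_regression

# Observational sample: treatment x depends on the confounder z.
observational = generate_dataset_linear(n_samples=10000)

# Simulated RCT: x is assigned externally, independent of z.
x = np.random.binomial(n=1, p=0.5, size=10000)
rct = generate_dataset_linear(n_samples=10000, set_X=x)

# High-dimensional, approximately low-rank regression data.
X, y = generate_dataset_regression(n_features=500, rank=10, noise=1.0)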
61 changes: 61 additions & 0 deletions data/helpers.py
@@ -0,0 +1,61 @@
import numpy as np
import pandas as pd

def logistic_fx(x):
"""
Standard logistic function with L = 1, k = 1, and x_0 = 0.
Maps a real number from (-inf, inf) to (0,1) centered on x=0.
Args:
x (float): input
Returns:
y (float): output
"""

y = 1 / (1 + np.exp(-x))

return y


def estimate_effect(df):
"""
Estimated effect is the difference in response due to a treatment/intervention as measured in an
RCT or A/B test. We can simply calculate this as a difference in means with a
95% CI between the responses of our control and experimental groups.
Args:
df (pandas.DataFrame): a dataframe of samples.
Returns:
estimated_effect (dict[str, float]): dictionary containing "estimated_effect", the difference in means between the
treated and untreated samples, and "standard_error", the half-width of the 95% confidence interval around "estimated_effect"
"""

base = df[df.x == 0]
variant = df[df.x == 1]
delta = variant.y.mean() - base.y.mean()
delta_err = 1.96 * np.sqrt(variant.y.var() / variant.shape[0] + base.y.var() / base.shape[0])

return {"estimated_effect": delta, "standard_error": delta_err}


def run_ab_test(datagenerator, n_samples=10000):
"""
Generates n_samples from a datagenerator with the value of X randomized
so that 50% of the samples receive treatment X=1 and 50% receive X=0,
and feeds the results into `estimate_effect` to get an unbiased
estimate of the average treatment effect.
Args:
datagenerator (method): a datagenerator method from datagen
n_samples (int): an integer describing number of samples to draw
Returns:
estimated_effect (dict[str, float]): see estimate_effect for details
"""
n_samples_a = int(n_samples / 2)
n_samples_b = n_samples - n_samples_a

set_X = np.concatenate([np.ones(n_samples_a), np.zeros(n_samples_b)]).astype(np.int64)
ds = datagenerator(n_samples=n_samples, set_X=set_X)

return estimate_effect(ds)
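To illustrate why the randomization in run_ab_test matters, here is a small sketch (sample sizes are illustrative assumptions): the naive difference in means on the confounded observational data is biased, while randomizing X recovers the true effect of roughly -0.5 built into generate_dataset_linear.

from data.datagen import generate_dataset_linear
from data.helpers import estimate_effect, run_ab_test

# Naive difference in means on observational data: biased, because treated
# units tend to have larger z and therefore larger y.
observational = generate_dataset_linear(n_samples=10000)
print(estimate_effect(observational))

# Randomized assignment: unbiased estimate, close to the true effect of -0.5.
print(run_ab_test(generate_dataset_linear, n_samples=10000))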
75 changes: 75 additions & 0 deletions experiments/train_sim_01.py
@@ -0,0 +1,75 @@
import os
os.chdir('/home/aisinai/work/repos/deepmatch-simulation')

import tables
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from model.autoencoder import AutoEncoder
from trainer.trainer import Trainer

# Create Global Variables
DATA_FOLDER = 'data/'

DEFAULT_BUILD = {
'data' : {
'input_features' : 500
},

'embedding' : {
'dims' : 50
},

'encoding' : {
'total_layers' : 1,
'scale' : 4,
'activation' : 'leaky_relu',
},

'latent' : {'dimensions' : 64},

'decoding' : {
'scale' : 4,
'activation' : 'leaky_relu',
'total_layers' : 1,
'output_dims' : None
}
}

BATCH_SIZE = 512
NUM_WORKERS = 4
LEARNING_RATE = 5e-4
NUM_EPOCHS = 1000000

SAVE_PATH = '{0}/'.format(os.path.splitext(__file__)[0])  # strip the .py extension and save into a matching folder
print("Saving at {0}.".format(SAVE_PATH))

# DEVICE = torch.device('cuda')
DEVICE = torch.device('cuda:0')

"""
"""

# Main Script
def train():
# Create the autoencoder.
ae = AutoEncoder(DEFAULT_BUILD).to(DEVICE)

# Instantiate the data loader.
db = np.load(DATA_FOLDER + 'simulation_data_X.npy')
db = db[:80000, :] # Training

db_dl = DataLoader(db, batch_size=BATCH_SIZE, pin_memory=True, num_workers=NUM_WORKERS, shuffle=True)

# Instantiate the loss function.
loss_function = nn.MSELoss().to(DEVICE)

# Create our trainer.
trainer = Trainer(ae, loss_function, LEARNING_RATE, db_dl, DEVICE)
trainer.train(NUM_EPOCHS, SAVE_PATH)

if __name__ == '__main__':
train()
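A possible held-out evaluation sketch to pair with this script (not part of the commit, and assuming the AutoEncoder's forward pass returns the reconstruction like a standard nn.Module): the rows after the first 80,000 training samples are treated as a held-out split.

def evaluate(ae):
    # Held-out rows: everything after the 80,000 training samples.
    db = np.load(DATA_FOLDER + 'simulation_data_X.npy')
    held_out = torch.from_numpy(db[80000:, :]).float().to(DEVICE)
    ae.eval()
    with torch.no_grad():
        recon = ae(held_out)  # assumed to return the reconstruction
        mse = nn.functional.mse_loss(recon, held_out).item()
    print('Held-out reconstruction MSE: {0:.4f}'.format(mse))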
71 changes: 71 additions & 0 deletions experiments/train_sim_02.py
@@ -0,0 +1,71 @@
import os
os.chdir('/home/aisinai/work/repos/deepmatch-simulation')

import tables
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from model.autoencoder import AutoEncoder
from trainer.trainer import Trainer

# Create Global Variables
DATA_FOLDER = 'data/'

DEFAULT_BUILD = {
'data' : {
'input_features' : 500
},

'encoding' : {
'total_layers' : 1,
'scale' : 4,
'activation' : 'leaky_relu',
},

'latent' : {'dimensions' : 32},

'decoding' : {
'scale' : 4,
'activation' : 'leaky_relu',
'total_layers' : 1,
'output_dims' : None
}
}

BATCH_SIZE = 512
NUM_WORKERS = 4
LEARNING_RATE = 1e-3
NUM_EPOCHS = 100000

SAVE_PATH = '{0}/'.format(os.path.splitext(__file__)[0])  # strip the .py extension and save into a matching folder
print("Saving at {0}.".format(SAVE_PATH))

# DEVICE = torch.device('cuda')
DEVICE = torch.device('cuda:0')

"""
"""

# Main Script
def train():
# Create the autoencoder.
ae = AutoEncoder(DEFAULT_BUILD).to(DEVICE)

# Instantiate the data loader.
db = np.load(DATA_FOLDER + 'simulation_data_X.npy')
db = db[:80000, :] # Training

db_dl = DataLoader(db, batch_size=BATCH_SIZE, pin_memory=True, num_workers=NUM_WORKERS, shuffle=True)

# Instantiate the loss function.
loss_function = nn.MSELoss().to(DEVICE)

# Create our trainer.
trainer = Trainer(ae, loss_function, LEARNING_RATE, db_dl, DEVICE)
trainer.train(NUM_EPOCHS, SAVE_PATH)

if __name__ == '__main__':
train()