
Commit

Add files via upload
RespectableGlioma authored Mar 30, 2021
0 parents commit b36f47e
Showing 15 changed files with 1,580 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -0,0 +1,3 @@
# DeepMatch - Simulation

Repository containing code for performing DeepMatch experiments on synthetic data.
81 changes: 81 additions & 0 deletions data/data_loader.py
@@ -0,0 +1,81 @@
import numpy as np
import h5py
import tables
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

ALLOWED_STATES = ['TRAIN', 'TEST', 'VAL']

class SimulationDataset(torch.utils.data.Dataset):
def __init__(self, data_fn, state, **kwargs):
""" Manages loading of NIS database samples in a H5 file, particularly for training.
"""
# super(NISDatabase, self).__init__()
self.filename = data_fn
self.dataset = np.load(data_fn)
self.dataset_key = kwargs.get('dataset_key', 'dataset')

self.state_inds = kwargs.get('state_inds', {})
self.allowed_states = kwargs.get('allowed_states', [])
self.pin_memory = kwargs.get('pin_memory', True)
self.change_state(state)

self.batch_size = kwargs.get('batch_size', 1000)
self.iterator = DataLoader(self, batch_size=self.batch_size, pin_memory=self.pin_memory, num_workers=0)

def __getitem__(self, index):
""" Load input data.
Note that index really indexes into inds for training, test, val split.
Also, loading data in worker processes and opening the HDF5 file *once*
in __get_item__ ensures multithreading from the DataLoader class
works appropriately. See
https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/16
for full discussion.
@TODO consider using torch.nn.data.Sampler instead of code below.
"""

state_index = self.inds[index]
return self.dataset[state_index]

def __len__(self):
""" Return length of dataset we're operating on.
"""
return self.dataset_len

def set_inds(self, inds):
self.dataset_len = inds.shape[0]
self.inds = inds

def set_dataset_key(self, key):
# Only meaningful for keyed backing stores (e.g. HDF5); invalidates the cached array so it must be reloaded.
self.dataset_key = key
self.dataset = None

def set_batch_size(self, batch_size):
self.batch_size = batch_size
self.iterator = DataLoader(self, batch_size=batch_size, pin_memory=self.pin_memory, num_workers=0)

def change_state(self, state):
""" Change attributes based on state of model.
"""

if not self.state_inds:
# If no split indices were given, fall back to a single 'full' split over the whole dataset.
self.state_inds = {
'full' : np.arange(self.dataset.shape[0])
}
self.allowed_states = ['full', ]

else:
# If split indices were provided as a dict, their keys define the allowed states.
if isinstance(self.state_inds, dict):
self.allowed_states = list(self.state_inds.keys())

if state not in self.allowed_states:
raise ValueError(f'{state} not found in allowed states {self.allowed_states}.')

self.set_inds(self.state_inds[state])
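A minimal usage sketch for SimulationDataset (not part of the commit): the array shape, split boundaries, and batch size below are illustrative assumptions; any (N, D) array saved as .npy works.

import numpy as np
import torch
from data.data_loader import SimulationDataset

# Hypothetical 100,000-sample array saved as .npy.
splits = {
    'TRAIN': np.arange(0, 80000),
    'VAL':   np.arange(80000, 90000),
    'TEST':  np.arange(90000, 100000),
}
ds = SimulationDataset('data/simulation_data_X.npy', 'TRAIN',
                       state_inds=splits, batch_size=512,
                       pin_memory=torch.cuda.is_available())

for batch in ds.iterator:
    pass  # each batch is a tensor of shape (512, n_features)

ds.change_state('VAL')  # switch the same object to the validation indices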
61 changes: 61 additions & 0 deletions data/datagen.py
@@ -0,0 +1,61 @@
import numpy as np
import pandas as pd

from sklearn.datasets import make_regression


def generate_dataset_linear(n_samples=500, set_X=None):
"""
Generate samples from two correlated distributions and allow for setting a specific
value for X to simulate an RCT where patients are assigned to experimental vs control groups.
X is a Bernoulli variable sampled from a binomial distribution.
Z is a confounder sampled from a uniform distribution.
Y is a continuous real number.
The probabilistic model for this is:
X --> Y
Z --> X
Z --> Y
Args:
n_samples (int): no. of samples to generate
set_X (arr): numpy array of treatment assignments used to set X to a specific intervention, simulating an RCT
Returns:
samples (pandas.DataFrame): a pandas dataframe of sampled data in the form [x, y, z]
"""

z = np.random.uniform(size=n_samples)

if set_X is not None:
assert(len(set_X) == n_samples)
x = set_X
else:
p_x = np.minimum(np.maximum(z,0.1), 0.9)
x = np.random.binomial(n=1, p=p_x, size=n_samples)

y0 = 2 * z
y1 = y0 - 0.5

y = np.where(x == 0, y0, y1) + 0.3 * np.random.normal(size=n_samples)

return pd.DataFrame({"x":x, "y":y, "z":z})


def generate_dataset_regression(n_features, rank, noise=0):
"""
Generate a dataset of 100,000 samples with n_features features and approximate rank 'rank'.
Args:
n_features (int): no. of features to generate
rank (int): approximate effective rank of the feature matrix (passed to make_regression as effective_rank)
noise (float): standard deviation of the Gaussian noise added to the output
Returns:
dataset (tuple of numpy.ndarray): the feature matrix X and target vector y returned by sklearn's make_regression.
"""

return make_regression(n_samples=100000, n_features=n_features, effective_rank=rank, noise=noise)
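A quick sketch of how these generators might be called (illustrative, not part of the commit); note that generate_dataset_regression returns the (X, y) tuple produced by sklearn's make_regression rather than a DataFrame, and the sample sizes and parameters below are assumptions.

import numpy as np
from data.datagen import generate_dataset_linear, generate_dataset_regression

# Observational sample: treatment x depends on the confounder z.
observational = generate_dataset_linear(n_samples=10000)

# Simulated RCT: x is assigned externally, independent of z.
x = np.random.binomial(n=1, p=0.5, size=10000)
rct = generate_dataset_linear(n_samples=10000, set_X=x)

# High-dimensional, approximately low-rank regression data.
X, y = generate_dataset_regression(n_features=500, rank=10, noise=1.0)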
61 changes: 61 additions & 0 deletions data/helpers.py
@@ -0,0 +1,61 @@
import numpy as np
import pandas as pd

def logistic_fx(x):
"""
Standard logistic function with L = 1, k = 1, and x_0 = 0.
Maps a real number from (-inf, inf) to (0,1) centered on x=0.
Args:
x (float): input
Returns:
y (float): output
"""

y = 1 / (1 + np.exp(-x))

return y


def estimate_effect(df):
"""
Estimated effect is the difference in response due to a treatment/intervention as measured in an
RCT or A/B test. We can simply calculate this as a difference in means with a
95% CI between the responses of our control and experimental groups.
Args:
df (pandas.DataFrame): a dataframe of samples.
Returns:
estimated_effect (dict[str, float]): dictionary containing "estimated_effect", the difference in means between the
treated and untreated samples, and "standard_error", the half-width of the 95% confidence interval around "estimated_effect"
"""

base = df[df.x == 0]
variant = df[df.x == 1]
delta = variant.y.mean() - base.y.mean()
delta_err = 1.96 * np.sqrt(variant.y.var() / variant.shape[0] + base.y.var() / base.shape[0])

return {"estimated_effect": delta, "standard_error": delta_err}


def run_ab_test(datagenerator, n_samples=10000):
"""
Generates n_samples from a datagenerator with the value of X randomized
so that 50% of the samples receive treatment X=1 and 50% receive X=0,
and feeds the results into `estimate_effect` to get an unbiased
estimate of the average treatment effect.
Args:
datagenerator (method): a datagenerator method from datagen
n_samples (int): an integer describing number of samples to draw
Returns:
estimated_effect (dict[str, float]): see estimate_effect for details
"""
n_samples_a = int(n_samples / 2)
n_samples_b = n_samples - n_samples_a

set_X = np.concatenate([np.ones(n_samples_a), np.zeros(n_samples_b)]).astype(np.int64)
ds = datagenerator(n_samples=n_samples, set_X=set_X)

return estimate_effect(ds)
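To illustrate why the randomization in run_ab_test matters, here is a small sketch (sample sizes are illustrative assumptions): the naive difference in means on the confounded observational data is biased, while randomizing X recovers the true effect of roughly -0.5 built into generate_dataset_linear.

from data.datagen import generate_dataset_linear
from data.helpers import estimate_effect, run_ab_test

# Naive difference in means on observational data: biased, because treated
# units tend to have larger z and therefore larger y.
observational = generate_dataset_linear(n_samples=10000)
print(estimate_effect(observational))

# Randomized assignment: unbiased estimate, close to the true effect of -0.5.
print(run_ab_test(generate_dataset_linear, n_samples=10000))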
75 changes: 75 additions & 0 deletions experiments/train_sim_01.py
@@ -0,0 +1,75 @@
import os
os.chdir('/home/aisinai/work/repos/deepmatch-simulation')

import tables
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from model.autoencoder import AutoEncoder
from trainer.trainer import Trainer

# Create Global Variables
DATA_FOLDER = 'data/'

DEFAULT_BUILD = {
'data' : {
'input_features' : 500
},

'embedding' : {
'dims' : 50
},

'encoding' : {
'total_layers' : 1,
'scale' : 4,
'activation' : 'leaky_relu',
},

'latent' : {'dimensions' : 64},

'decoding' : {
'scale' : 4,
'activation' : 'leaky_relu',
'total_layers' : 1,
'output_dims' : None
}
}

BATCH_SIZE = 512
NUM_WORKERS = 4
LEARNING_RATE = 5e-4
NUM_EPOCHS = 1000000

SAVE_PATH = '{0}/'.format(os.path.splitext(__file__)[0])  # strip the .py extension and save into a matching folder
print("Saving at {0}.".format(SAVE_PATH))

# DEVICE = torch.device('cuda')
DEVICE = torch.device('cuda:0')

"""
"""

# Main Script
def train():
# Create the autoencoder.
ae = AutoEncoder(DEFAULT_BUILD).to(DEVICE)

# Instantiate the data loader.
db = np.load(DATA_FOLDER + 'simulation_data_X.npy')
db = db[:80000, :] # Training

db_dl = DataLoader(db, batch_size=BATCH_SIZE, pin_memory=True, num_workers=NUM_WORKERS, shuffle=True)

# Instantiate the loss function.
loss_function = nn.MSELoss().to(DEVICE)

# Create our trainer.
trainer = Trainer(ae, loss_function, LEARNING_RATE, db_dl, DEVICE)
trainer.train(NUM_EPOCHS, SAVE_PATH)

if __name__ == '__main__':
train()
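A possible held-out evaluation sketch to pair with this script (not part of the commit, and assuming the AutoEncoder's forward pass returns the reconstruction like a standard nn.Module): the rows after the first 80,000 training samples are treated as a held-out split.

def evaluate(ae):
    # Held-out rows: everything after the 80,000 training samples.
    db = np.load(DATA_FOLDER + 'simulation_data_X.npy')
    held_out = torch.from_numpy(db[80000:, :]).float().to(DEVICE)
    ae.eval()
    with torch.no_grad():
        recon = ae(held_out)  # assumed to return the reconstruction
        mse = nn.functional.mse_loss(recon, held_out).item()
    print('Held-out reconstruction MSE: {0:.4f}'.format(mse))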
71 changes: 71 additions & 0 deletions experiments/train_sim_02.py
@@ -0,0 +1,71 @@
import os
os.chdir('/home/aisinai/work/repos/deepmatch-simulation')

import tables
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from model.autoencoder import AutoEncoder
from trainer.trainer import Trainer

# Create Global Variables
DATA_FOLDER = 'data/'

DEFAULT_BUILD = {
'data' : {
'input_features' : 500
},

'encoding' : {
'total_layers' : 1,
'scale' : 4,
'activation' : 'leaky_relu',
},

'latent' : {'dimensions' : 32},

'decoding' : {
'scale' : 4,
'activation' : 'leaky_relu',
'total_layers' : 1,
'output_dims' : None
}
}

BATCH_SIZE = 512
NUM_WORKERS = 4
LEARNING_RATE = 1e-3
NUM_EPOCHS = 100000

SAVE_PATH = '{0}/'.format(os.path.splitext(__file__)[0])  # strip the .py extension and save into a matching folder
print("Saving at {0}.".format(SAVE_PATH))

# DEVICE = torch.device('cuda')
DEVICE = torch.device('cuda:0')

"""
"""

# Main Script
def train():
# Create the autoencoder.
ae = AutoEncoder(DEFAULT_BUILD).to(DEVICE)

# Instantiate the data loader.
db = np.load(DATA_FOLDER + 'simulation_data_X.npy')
db = db[:80000, :] # Training

db_dl = DataLoader(db, batch_size=BATCH_SIZE, pin_memory=True, num_workers=NUM_WORKERS, shuffle=True)

# Instantiate the loss function.
loss_function = nn.MSELoss().to(DEVICE)

# Create our trainer.
trainer = Trainer(ae, loss_function, LEARNING_RATE, db_dl, DEVICE)
trainer.train(NUM_EPOCHS, SAVE_PATH)

if __name__ == '__main__':
train()