Commit b36f47e (0 parents): 15 changed files with 1,580 additions and 0 deletions.
@@ -0,0 +1,3 @@
# DeepMatch - Simulation

Repository containing code for performing DeepMatch experiments on synthetic data.
@@ -0,0 +1,81 @@
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset


ALLOWED_STATES = ['TRAIN', 'TEST', 'VAL']


class SimulationDataset(Dataset):
    def __init__(self, data_fn, state, **kwargs):
        """ Manages loading of simulation samples from a NumPy (.npy) file, particularly for training.
        """
        super().__init__()
        self.filename = data_fn
        self.dataset = np.load(data_fn)
        self.dataset_key = kwargs.get('dataset_key', 'dataset')

        self.state_inds = kwargs.get('state_inds', {})
        self.allowed_states = kwargs.get('allowed_states', [])
        self.pin_memory = kwargs.get('pin_memory', True)
        self.change_state(state)

        self.batch_size = kwargs.get('batch_size', 1000)
        self.iterator = DataLoader(self, batch_size=self.batch_size, pin_memory=self.pin_memory, num_workers=0)

    def __getitem__(self, index):
        """ Load input data.
        Note that index really indexes into the index array for the current
        train/test/val split. When loading from HDF5 instead of .npy, the file
        should be opened once per worker process so that multiprocessing from
        the DataLoader class works appropriately. See
        https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/16
        for the full discussion.
        @TODO consider using torch.utils.data.Sampler instead of the code below.
        """
        state_index = self.inds[index]
        return self.dataset[state_index]

    def __len__(self):
        """ Return the length of the split we're operating on. """
        return self.dataset_len

    def set_inds(self, inds):
        self.dataset_len = inds.shape[0]
        self.inds = inds

    def set_dataset_key(self, key):
        self.dataset_key = key
        self.dataset = None

    def set_batch_size(self, batch_size):
        self.batch_size = batch_size
        self.iterator = DataLoader(self, batch_size=batch_size, pin_memory=self.pin_memory, num_workers=0)

    def change_state(self, state):
        """ Change attributes based on the state of the model. """
        if len(self.state_inds) == 0:
            # If we want the full dataset, this is how we specify it.
            self.state_inds = {
                'full': np.arange(self.dataset.shape[0])
            }
            self.allowed_states = ['full', ]
        else:
            # If we've already declared unique states, derive the allowed states from them.
            if isinstance(self.state_inds, dict):
                self.allowed_states = list(self.state_inds.keys())

        if state not in self.allowed_states:
            raise ValueError(f'{state} not found in allowed states.')

        self.set_inds(self.state_inds[state])
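For reference, a minimal usage sketch of SimulationDataset; the file path and split boundaries below are illustrative assumptions, not part of this commit:

import numpy as np

# Illustrative split over a hypothetical 100,000-row array.
inds = np.arange(100000)
state_inds = {
    'TRAIN': inds[:80000],
    'VAL':   inds[80000:90000],
    'TEST':  inds[90000:],
}

ds = SimulationDataset('data/simulation_data_X.npy', 'TRAIN',
                       state_inds=state_inds, batch_size=512)

for batch in ds.iterator:   # DataLoader built in __init__
    break                   # each batch has shape (512, n_features)

ds.change_state('VAL')      # switch the active split in place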
@@ -0,0 +1,61 @@
import numpy as np
import pandas as pd

from sklearn.datasets import make_regression


def generate_dataset_linear(n_samples=500, set_X=None):
    """
    Generate samples from two correlated distributions and allow a specific
    value of X to be set, to simulate an RCT where patients are assigned to
    experimental vs. control groups.
    X is a Bernoulli treatment variable sampled from a binomial distribution.
    Z is a confounder sampled from a uniform distribution.
    Y is a continuous real number.
    The probabilistic model for this is:
        X --> Y
        Z --> X
        Z --> Y
    Args:
        n_samples (int): number of samples to generate
        set_X (arr): numpy array that fixes X to a specific intervention to simulate an RCT
    Returns:
        samples (pandas.DataFrame): a pandas DataFrame of sampled data in the form [x, y, z]
    """
    z = np.random.uniform(size=n_samples)

    if set_X is not None:
        assert len(set_X) == n_samples
        x = set_X
    else:
        # Clip the treatment propensity to [0.1, 0.9] so both arms are observed.
        p_x = np.minimum(np.maximum(z, 0.1), 0.9)
        x = np.random.binomial(n=1, p=p_x, size=n_samples)

    # Potential outcomes: treatment shifts the response down by 0.5.
    y0 = 2 * z
    y1 = y0 - 0.5

    y = np.where(x == 0, y0, y1) + 0.3 * np.random.normal(size=n_samples)

    return pd.DataFrame({"x": x, "y": y, "z": z})


def generate_dataset_regression(n_features, rank, noise=0):
    """
    Generate a dataset of 100,000 samples with n_features features and approximate rank 'rank'.
    Args:
        n_features (int): number of features to generate
        rank (int): approximate (effective) rank of the dataset
        noise (float): standard deviation of Gaussian noise added to the output
    Returns:
        dataset (tuple): a tuple (X, y) of numpy arrays of regression data.
    """
    # make_regression takes the rank via its `effective_rank` keyword.
    return make_regression(n_samples=100000, n_features=n_features, effective_rank=rank, noise=noise)
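As a quick illustration of the confounding built into generate_dataset_linear (Z raises both the treatment probability and the baseline outcome), a sketch comparing the naive observational contrast against the true effect of -0.5:

df = generate_dataset_linear(n_samples=10000)

# The naive difference in means is biased upward: treated units tend to have
# larger Z, and Z also raises Y through y0 = 2 * z.
naive = df[df.x == 1].y.mean() - df[df.x == 0].y.mean()
print(naive)   # noticeably above the true effect of -0.5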
@@ -0,0 +1,61 @@
import numpy as np
import pandas as pd


def logistic_fx(x):
    """
    Simplified logistic function (sigmoid) with L = 1, k = 1, and x_0 = 0.
    Maps a real number from (-inf, inf) to (0, 1), centered on x = 0.
    Args:
        x (float): input
    Returns:
        y (float): output
    """
    y = 1 / (1 + np.exp(-x))

    return y


def estimate_effect(df):
    """
    The estimated effect is the difference in response due to a treatment/intervention,
    as measured in an RCT or A/B test. We can simply calculate this as a difference in
    means, with a 95% CI, between the responses of our control and experimental groups.
    Args:
        df (pandas.DataFrame): a dataframe of samples.
    Returns:
        estimated_effect (dict[str, float]): dictionary containing the difference in means
            between the treated and untreated samples ("estimated_effect") and the
            "standard_error", the half-width of the 95% confidence interval around
            "estimated_effect"
    """
    base = df[df.x == 0]
    variant = df[df.x == 1]
    delta = variant.y.mean() - base.y.mean()
    # 1.96 is the z-score for a two-sided 95% confidence interval.
    delta_err = 1.96 * np.sqrt(variant.y.var() / variant.shape[0] + base.y.var() / base.shape[0])

    return {"estimated_effect": delta, "standard_error": delta_err}


def run_ab_test(datagenerator, n_samples=10000):
    """
    Generates n_samples from a datagenerator with the value of X randomized
    so that 50% of the samples receive treatment X=1 and 50% receive X=0,
    and feeds the results into `estimate_effect` to get an unbiased
    estimate of the average treatment effect.
    Args:
        datagenerator (method): a datagenerator method from datagen
        n_samples (int): number of samples to draw
    Returns:
        estimated_effect (dict[str, float]): see estimate_effect for details
    """
    n_samples_a = int(n_samples / 2)
    n_samples_b = n_samples - n_samples_a

    set_X = np.concatenate([np.ones(n_samples_a), np.zeros(n_samples_b)]).astype(np.int64)
    ds = datagenerator(n_samples=n_samples, set_X=set_X)

    return estimate_effect(ds)
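Putting the two modules together: randomizing X through run_ab_test breaks the Z --> X path, so the estimate should recover the true effect of roughly -0.5 (the datagen import path is an assumption):

from datagen import generate_dataset_linear   # assumed module path

result = run_ab_test(generate_dataset_linear, n_samples=10000)
print(result)   # 'estimated_effect' should land near -0.5, within 'standard_error'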
@@ -0,0 +1,75 @@
import os
os.chdir('/home/aisinai/work/repos/deepmatch-simulation')

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from model.autoencoder import AutoEncoder
from trainer.trainer import Trainer

# Global variables.
DATA_FOLDER = 'data/'

DEFAULT_BUILD = {
    'data': {
        'input_features': 500
    },

    'embedding': {
        'dims': 50
    },

    'encoding': {
        'total_layers': 1,
        'scale': 4,
        'activation': 'leaky_relu',
    },

    'latent': {'dimensions': 64},

    'decoding': {
        'scale': 4,
        'activation': 'leaky_relu',
        'total_layers': 1,
        'output_dims': None
    }
}

BATCH_SIZE = 512
NUM_WORKERS = 4
LEARNING_RATE = 5e-4
NUM_EPOCHS = 1000000

SAVE_PATH = '{0}/'.format(__file__.split('.')[0])  # strip .py and save into a folder named after this script
print("Saving at {0}.".format(SAVE_PATH))

DEVICE = torch.device('cuda:0')


# Main script.
def train():
    # Create the autoencoder.
    ae = AutoEncoder(DEFAULT_BUILD).to(DEVICE)

    # Instantiate the data loader on the first 80,000 rows (the training split).
    db = np.load(DATA_FOLDER + 'simulation_data_X.npy')
    db = db[:80000, :]

    db_dl = DataLoader(db, batch_size=BATCH_SIZE, pin_memory=True, num_workers=NUM_WORKERS, shuffle=True)

    # Instantiate the loss function.
    loss_function = nn.MSELoss().to(DEVICE)

    # Create our trainer and run.
    trainer = Trainer(ae, loss_function, LEARNING_RATE, db_dl, DEVICE)
    trainer.train(NUM_EPOCHS, SAVE_PATH)


if __name__ == '__main__':
    train()
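The script above assumes data/simulation_data_X.npy already exists. A minimal sketch of producing a compatible matrix with generate_dataset_regression: 100,000 samples by 500 features matches input_features and leaves room for the 80,000-row training slice plus a held-out remainder (the rank and noise values, and the datagen import path, are illustrative assumptions):

import numpy as np
from datagen import generate_dataset_regression   # assumed module path

# Illustrative rank/noise; make_regression returns (X, y) and we keep X only.
X, _ = generate_dataset_regression(n_features=500, rank=50, noise=0.1)
np.save('data/simulation_data_X.npy', X.astype(np.float32))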
@@ -0,0 +1,71 @@
import os
os.chdir('/home/aisinai/work/repos/deepmatch-simulation')

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from model.autoencoder import AutoEncoder
from trainer.trainer import Trainer

# Global variables.
DATA_FOLDER = 'data/'

DEFAULT_BUILD = {
    'data': {
        'input_features': 500
    },

    'encoding': {
        'total_layers': 1,
        'scale': 4,
        'activation': 'leaky_relu',
    },

    'latent': {'dimensions': 32},

    'decoding': {
        'scale': 4,
        'activation': 'leaky_relu',
        'total_layers': 1,
        'output_dims': None
    }
}

BATCH_SIZE = 512
NUM_WORKERS = 4
LEARNING_RATE = 1e-3
NUM_EPOCHS = 100000

SAVE_PATH = '{0}/'.format(__file__.split('.')[0])  # strip .py and save into a folder named after this script
print("Saving at {0}.".format(SAVE_PATH))

DEVICE = torch.device('cuda:0')


# Main script.
def train():
    # Create the autoencoder.
    ae = AutoEncoder(DEFAULT_BUILD).to(DEVICE)

    # Instantiate the data loader on the first 80,000 rows (the training split).
    db = np.load(DATA_FOLDER + 'simulation_data_X.npy')
    db = db[:80000, :]

    db_dl = DataLoader(db, batch_size=BATCH_SIZE, pin_memory=True, num_workers=NUM_WORKERS, shuffle=True)

    # Instantiate the loss function.
    loss_function = nn.MSELoss().to(DEVICE)

    # Create our trainer and run.
    trainer = Trainer(ae, loss_function, LEARNING_RATE, db_dl, DEVICE)
    trainer.train(NUM_EPOCHS, SAVE_PATH)


if __name__ == '__main__':
    train()
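Both scripts train only on the first 80,000 rows, leaving the remainder unused. A brief sketch of scoring reconstruction error on that held-out slice, assuming the trained AutoEncoder `ae` is callable on a batch tensor and returns the reconstruction (its interface is not shown in this commit):

import numpy as np
import torch

device = torch.device('cuda:0')
db = np.load('data/simulation_data_X.npy')
held_out = torch.from_numpy(db[80000:, :]).float().to(device)

ae.eval()
with torch.no_grad():
    recon = ae(held_out)                      # assumed: forward() returns the reconstruction
    mse = torch.mean((recon - held_out) ** 2)
print('Held-out MSE: {0}'.format(mse.item()))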