BioSystemsUM · marta-seq · Feb 23, 2022 · Feb 24, 2022 · Feb 24, 2022 · Feb 24, 2022
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,7 @@
 build/*
 __pycache__/*
 .idea/*
-dist/*
+dist/*
+venv/
+.DS_Store
+__pycache__
diff --git a/docs/_guides/propythia_descriptors_2021.pdf b/docs/_guides/propythia_descriptors_2021.pdf
diff --git a/docs/_guides/propythia_user_guide_2021.pdf b/docs/_guides/propythia_user_guide_2021.pdf
diff --git a/requirements → requirements_dna b/requirements → requirements_dna
diff --git a/src/propythia/DNA/.gitignore b/src/propythia/DNA/.gitignore
@@ -0,0 +1,7 @@
+__pycache__/
+.ipynb_checkpoints/
+.mypy_cache/
+.vscode/
+datasets/
+src_old
+backup/
diff --git a/src/propythia/DNA/README.md b/src/propythia/DNA/README.md
@@ -0,0 +1,21 @@
+# Note
+
+## Machine Learning Part
+
+* `data` is where the physicochemical indices are stored, which are used to calculate some descriptors.
+* `descriptors.py` is the file that contains the calculation of all descriptors for a given sequence.
+* `calculate_features.py` is a script that calculates all descriptors for an entire dataset (with the help of `descriptors.py`) and creates a dataframe with all the descriptors.
+* `notebooks/quick-start-ML.ipynb` is a notebook that explains how to perform every step of the developed modules. It includes data reading and validation, calculation of descriptors from sequences, descriptors processing and using processed descriptors to train ML models (already implemented in ProPythia).
+
+## Deep Learning Part
+
+* `deep_ml.py` runs a combination of set hyperparameters or performs hyperparameter tuning for the given model, feature mode, and data directory.
+* `outputs` is a directory where the output of the hyperparameter tuning is stored. Only the filtered results, with the score of each model, are stored in the directory.
+* `src` is a directory where the source code of the entire DL pipeline is stored.
+* `essential_genes` is a directory where all the information about the essential genes is stored, since building the dataset required a lot of data preprocessing.
+* `config.json` is a file that contains the configuration of the entire DL pipeline.
+
+## Both Parts
+
+* `utils.py` is a file that contains some useful functions.
+* `read_sequence.py` is the file that contains functions to read and validate DNA sequences. They can be read from a *CSV* file, a *FASTA* file, or from a single string.
diff --git a/src/propythia/DNA/calculate_features.py b/src/propythia/DNA/calculate_features.py
@@ -0,0 +1,133 @@
+from typing import List, Optional, Tuple
+
+import pandas as pd
+
+from descriptors import DNADescriptor
+
+def _calculate_descriptors(data: pd.DataFrame, descriptor_list: List) -> pd.DataFrame:
+    """
+    Compute descriptors for every entry of the 'sequence' column of ``data``.
+
+    ``descriptor_list`` is forwarded unchanged to ``DNADescriptor.get_descriptors``.
+    The returned dataframe has one row per sequence, holding the sequence itself
+    plus every value ``get_descriptors`` produced for it.
+    """
+    total = len(data)
+    rows = []
+    for position, sequence in enumerate(data['sequence']):
+        row = {'sequence': sequence}
+        row.update(DNADescriptor(sequence).get_descriptors(descriptor_list))
+        rows.append(row)
+
+        # progress report on the 1st, 101st, 201st, ... sequence
+        if position % 100 == 0:
+            print(position, '/', total)
+
+    print("Done!")
+    return pd.DataFrame(rows)
+
+
+def _process_lists(fps_x, field):
+    """
+    Expand the list/dict cells of column ``field`` into a dataframe of their own.
+
+    Every resulting column is named ``<field>_<original column label>``.
+    The column ``field`` is removed from ``fps_x`` in place; the expanded
+    dataframe is returned.
+    """
+    expanded = pd.DataFrame(fps_x[field].to_list())
+    expanded.columns = [f"{field}_{label}" for label in expanded.columns]
+    # the caller's frame is modified on purpose: the raw column is no longer needed
+    fps_x.drop(columns=field, inplace=True)
+    return expanded
+
+
+def _process_lists_of_lists(fps_x, field):
+    """
+    Expand a column whose cells are lists of lists (or lists of dicts).
+
+    The outer level is exploded into columns named ``<field>_<i>``; each of those
+    is exploded again into its own dataframe with columns ``<field>_<i>_<j>``.
+    The column ``field`` is removed from ``fps_x`` in place and the list of inner
+    dataframes (one per outer column) is returned.
+    """
+    outer = pd.DataFrame(fps_x[field].to_list())
+    outer.columns = [f"{field}_{label}" for label in outer.columns]
+
+    # placeholder used where a row had no entry for an outer column
+    if field == "enhanced_nucleic_acid_composition":
+        placeholder = {}
+    else:
+        placeholder = []
+
+    frames = []
+    for outer_name in outer.columns:
+        cells = []
+        for cell in outer[outer_name].to_list():
+            cells.append(placeholder if cell is None else cell)
+        inner = pd.DataFrame(cells)
+        inner.columns = [f"{outer_name}_{label}" for label in inner.columns]
+        frames.append(inner)
+
+    fps_x.drop(columns=field, inplace=True)
+    return frames
+
+
+
+def normalization(fps_x, descriptor_list):
+    """
+    Flatten descriptor columns that hold dicts or lists into plain columns.
+
+    A model cannot consume dict or list cells, so each such column is "exploded"
+    into one column per key (dicts) or per position (lists).
+
+    Dict example:
+
+    | descriptor_hello |
+    | ---------------- |
+    | {'a': 1, 'b': 2} |
+
+    becomes:
+
+    | descriptor_hello_a | descriptor_hello_b |
+    | ------------------ | ------------------ |
+    | 1                  | 2                  |
+
+    List example:
+
+    | descriptor_hello |
+    | ---------------- |
+    | [1, 2, 3]        |
+
+    becomes:
+
+    | descriptor_hello_0 | descriptor_hello_1 | descriptor_hello_2 |
+    | ------------------ | ------------------ | ------------------ |
+    | 1                  | 2                  | 3                  |
+
+    The exploded source columns are removed from ``fps_x`` in place. The returned
+    dataframe is the remaining columns of ``fps_x`` followed by the new ones.
+    When ``descriptor_list`` equals ``[]`` every known descriptor column is processed;
+    otherwise only the ones named in ``descriptor_list``.
+    """
+    flat_fields = [
+        "nucleic_acid_composition", "dinucleotide_composition", "trinucleotide_composition",
+        "k_spaced_nucleic_acid_pairs", "kmer", "PseDNC", "PseKNC",
+        "DAC", "DCC", "DACC", "TAC", "TCC", "TACC",
+    ]
+    nested_fields = ["accumulated_nucleotide_frequency"]
+
+    # keep only the descriptors the user asked for
+    if descriptor_list != []:
+        flat_fields = [name for name in flat_fields if name in descriptor_list]
+        nested_fields = [name for name in nested_fields if name in descriptor_list]
+
+    expanded = [_process_lists(fps_x, name) for name in flat_fields]
+    for name in nested_fields:
+        expanded.extend(_process_lists_of_lists(fps_x, name))
+
+    return pd.concat([fps_x, *expanded], axis=1)
+
+
+def calculate_and_normalize(data: pd.DataFrame, descriptor_list: Optional[list] = None) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
+    """
+    Calculate the descriptors of every sequence in ``data`` and normalize them in one step.
+
+    :param data: dataframe with a 'sequence' column and, optionally, a 'label' column.
+    :param descriptor_list: names of the descriptors to calculate. ``None`` (the default)
+        is treated as an empty list, which is passed on unchanged to the descriptor
+        calculation and to ``normalization``.
+    :return: a tuple ``(fps_x, fps_y)``: ``fps_x`` is the normalized descriptor dataframe
+        (without 'sequence' and 'label' columns); ``fps_y`` is ``data['label']`` when that
+        column exists, otherwise ``None``.
+    """
+    # A fresh list per call replaces the former shared mutable default (``[]``).
+    if descriptor_list is None:
+        descriptor_list = []
+
+    features = _calculate_descriptors(data, descriptor_list)
+    fps_y = data['label'] if 'label' in data else None
+
+    # drop() returns a new frame, so the in-place column removal performed inside
+    # normalization() never operates on a selection taken from `features`.
+    fps_x = features.drop(columns=['label', 'sequence'], errors='ignore')
+    fps_x = normalization(fps_x, descriptor_list)
+    return fps_x, fps_y
+
+if __name__ == "__main__":
+    # Demo run: read a labelled CSV dataset, compute all descriptors and print them.
+    from read_sequence import ReadDNA
+
+    dataset = ReadDNA().read_csv(filename='datasets/primer/dataset.csv', with_labels=True)
+    descriptors, _labels = calculate_and_normalize(dataset)
+    print(descriptors)
diff --git a/src/propythia/DNA/config.json b/src/propythia/DNA/config.json
@@ -0,0 +1,34 @@
+{
+    "combination":{
+        "model_label": "bi_lstm",
+        "mode": "chemical",
+        "data_dir": "essential_genes_100k_cut",
+        "class_weights": [1.0, 1.0]
+    },
+    "do_tuning": true,
+    "fixed_vals":{
+        "epochs": 500,
+        "optimizer_label": "adam",
+        "loss_function": "cross_entropy",
+        "patience": 2,
+        "output_size": 2,
+        "cpus_per_trial": 2, 
+        "gpus_per_trial": 2,
+        "num_samples": 5,
+        "kmer_one_hot": 2
+    },
+    "hyperparameters": {
+        "hidden_size": 32,
+        "lr": 1e-3,
+        "batch_size": 32,
+        "dropout": 0.35,
+        "num_layers": 1
+    },
+    "hyperparameter_search_space": {
+        "hidden_size": [32, 64, 128],
+        "lr": [1e-4, 1e-3, 1e-2],
+        "batch_size": [16, 32, 64],
+        "dropout": [0.2, 0.3, 0.4, 0.5],
+        "num_layers": [1, 2, 3]
+    }
+}
diff --git a/src/propythia/DNA/data/mmc3.data b/src/propythia/DNA/data/mmc3.data
diff --git a/src/propythia/DNA/data/mmc4.data b/src/propythia/DNA/data/mmc4.data
diff --git a/src/propythia/DNA/deep_ml.py b/src/propythia/DNA/deep_ml.py
@@ -0,0 +1,47 @@
+"""
+########################################################################
+Runs a combination of hyperparameters or performs hyperparameter tuning
+for the given model, feature mode, and data directory.
+########################################################################
+"""
+
+import torch
+import os
+from src.prepare_data import prepare_data
+from src.test import test
+from src.hyperparameter_tuning import hyperparameter_tuning
+from src.train import traindata
+from utils import print_metrics, read_config
+
+# Restrict CUDA to the GPUs with indices 1-5. The assignment is unconditional, so it
+# overwrites any CUDA_VISIBLE_DEVICES value already present in the environment.
+os.environ["CUDA_VISIBLE_DEVICES"] = '1,2,3,4,5'
+# Use 'cuda:0' when CUDA is available, otherwise fall back to the CPU.
+# NOTE(review): with the mask above, 'cuda:0' presumably maps to physical GPU 1 - confirm.
+device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
+def perform(config):
+    """
+    Run hyperparameter tuning or a single train/test cycle, as selected by ``config``.
+
+    When ``config['do_tuning']`` is truthy, only ``hyperparameter_tuning`` is called.
+    Otherwise the data is prepared from the 'combination', 'hyperparameters' and
+    'fixed_vals' sections, a model is trained, evaluated on the test loader and its
+    metrics are printed.
+    """
+    if config['do_tuning']:
+        hyperparameter_tuning(device, config)
+        return
+
+    combination = config['combination']
+    model_label = combination['model_label']
+    mode = combination['mode']
+    data_dir = combination['data_dir']
+    class_weights = combination['class_weights']
+    hyperparameters = config['hyperparameters']
+    batch_size = hyperparameters['batch_size']
+    kmer_one_hot = config['fixed_vals']['kmer_one_hot']
+
+    loaders_and_shapes = prepare_data(
+        data_dir=data_dir,
+        mode=mode,
+        batch_size=batch_size,
+        k=kmer_one_hot,
+    )
+    trainloader, testloader, validloader, input_size, sequence_length = loaders_and_shapes
+
+    # train on the training split, validating on the validation split
+    model = traindata(hyperparameters, device, config, trainloader, validloader, input_size, sequence_length)
+
+    # evaluate on the held-out test split and report
+    metrics = test(device, model, testloader)
+    print_metrics(model_label, mode, data_dir, kmer_one_hot, class_weights, metrics)
+
+if __name__ == '__main__':
+    # Load the configuration, then run tuning or a single train/test cycle.
+    perform(read_config(device))