Commit 8fadc34
Merge pull request #295 from PNNL-CompBio/sample_id_mapping_update
Cross-Build Sample & Drug ID Mapping
2 parents c39a21a + 358e95c, commit 8fadc34

File tree: 9 files changed, +1156 −48 lines

build/broad_sanger/05b_separate_datasets.py

Lines changed: 45 additions & 6 deletions

@@ -1,7 +1,8 @@
 import gc
 import polars as pl
-
-
+import os
+import gzip
+import shutil
 
 def main():
     datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]

@@ -23,58 +24,96 @@ def main():
     }
 
     for dataset in datasets_to_process:
-        exp = pl.read_csv("broad_sanger_experiments.tsv", separator="\t") # Keeping memory down, so I will not be making copies.
+        exp_in_filename = "broad_sanger_experiments.tsv"
+        if os.path.isfile(exp_in_filename + ".gz"):
+            exp_in_filename = exp_in_filename + ".gz"
+
+        exp = pl.read_csv(exp_in_filename, separator="\t") # Keeping memory down, so I will not be making copies.
         exp = exp.filter(pl.col("study") == dataset)
 
         # Extract information to separate out datasets
         exp_improve_sample_ids = exp["improve_sample_id"].unique().to_list()
         exp_improve_drug_ids = exp["improve_drug_id"].unique().to_list()
 
         # Write Filtered Experiments File to TSV. Then delete it from memory.
-        exp_filename = f"/tmp/{dataset}_experiments.tsv".lower()
-        exp.write_csv(exp_filename, separator="\t")
+        exp_filename_out = f"/tmp/{dataset}_experiments.tsv".lower()
+        exp.write_csv(exp_filename_out, separator="\t")
+        #Rewrite as gzipped if needed
+        if exp_in_filename.endswith(".gz"):
+            with open(exp_filename_out, 'rb') as f_in, gzip.open(exp_filename_out + ".gz", 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+            os.remove(exp_filename_out)
+
         del exp
        gc.collect()
 
 
         #Filter Samples files, write to file, delete from mem.
         for samples in samples_datatypes:
             samples_filename_in = f"broad_sanger_{samples}.csv"
+            if os.path.isfile(samples_filename_in + ".gz"):
+                samples_filename_in += ".gz"
+
             samples_filename_out = f"/tmp/{dataset}_{samples}.csv".lower()
             samples_df = pl.read_csv(samples_filename_in)
             samples_df = samples_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
             samples_df.write_csv(samples_filename_out) #csv
+
+            #Rewrite as gzipped if needed
+            if samples_filename_in.endswith(".gz"):
+                with open(samples_filename_out, 'rb') as f_in, gzip.open(samples_filename_out + ".gz", 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+                os.remove(samples_filename_out)
+
             del samples_df
             gc.collect()
 
         #One by one, filter other Omics files, write to file, delete from mem.
         for omics in omics_datatypes:
             omics_filename_in = f"broad_sanger_{omics}.csv"
+            if os.path.isfile(omics_filename_in + ".gz"):
+                omics_filename_in += ".gz"
+
             omics_filename_out = f"/tmp/{dataset}_{omics}.csv".lower()
             omics_df = pl.read_csv(omics_filename_in)
             omics_df = omics_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
             omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
             omics_df.write_csv(omics_filename_out) #csv
+
+            #Rewrite as gzipped if needed
+            if omics_filename_in.endswith(".gz"):
+                with open(omics_filename_out, 'rb') as f_in, gzip.open(omics_filename_out + ".gz", 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+                os.remove(omics_filename_out)
+
             del omics_df
             gc.collect()
 
 
         #One by one, filter other Drugs files, write to file, delete from mem.
         for drugs in drugs_datatypes:
             drugs_filename_in = f"broad_sanger_{drugs}.tsv"
+            if os.path.isfile(drugs_filename_in + ".gz"):
+                drugs_filename_in += ".gz"
+
             drugs_filename_out = f"/tmp/{dataset}_{drugs}.tsv".lower()
             if drugs == "drug_descriptors":
                 drugs_df = pl.read_csv(drugs_filename_in,separator="\t",
                                        dtypes={"improve_drug_id": pl.Utf8,
                                                "structural_descriptor": pl.Utf8,
                                                "descriptor_value": pl.Utf8}
                                        )
-
             else:
                 drugs_df = pl.read_csv(drugs_filename_in,separator="\t")
 
             drugs_df = drugs_df.filter(pl.col("improve_drug_id").is_in(exp_improve_drug_ids))
             drugs_df.write_csv(drugs_filename_out,separator="\t") #tsv
+
+            if drugs_filename_in.endswith(".gz"):
+                with open(drugs_filename_out, 'rb') as f_in, gzip.open(drugs_filename_out + ".gz", 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+                os.remove(drugs_filename_out)
+
             del drugs_df
             gc.collect()

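The write-then-gzip sequence added above is repeated verbatim for the experiments, samples, omics, and drugs outputs. A minimal sketch of how it could be factored into one helper — the name gzip_and_replace is hypothetical, not part of this commit:

    import gzip
    import os
    import shutil

    def gzip_and_replace(path):
        # Gzip `path` to `path + ".gz"`, then delete the uncompressed file,
        # mirroring the inline blocks in 05b_separate_datasets.py.
        with open(path, 'rb') as f_in, gzip.open(path + ".gz", 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(path)

    # Usage, e.g. after exp.write_csv(exp_filename_out, separator="\t"):
    # if exp_in_filename.endswith(".gz"):
    #     gzip_and_replace(exp_filename_out)
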
build/build_all.py

Lines changed: 64 additions & 23 deletions

@@ -11,6 +11,7 @@
 import gzip
 from glob import glob
 import sys
+import requests
 
 def main():
     parser=argparse.ArgumentParser(

@@ -131,7 +132,7 @@ def process_docker(datasets):
         datasets_to_build.extend(dataset_map.get(dataset, []))
 
     # Build the docker-compose command, adding specific datasets
-    compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
+    compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
 
     log_file_path = 'local/docker.log'
     env = os.environ.copy()

@@ -266,9 +267,11 @@ def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
     docker_run = ['docker', 'run', '--rm', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp", '-e', f"VERSION={version}"]
 
     # Add Appropriate Environment Variables
+    if name == "validate":
+        docker_run.extend(['upload'])
     if 'FIGSHARE_TOKEN' in env and name == 'Figshare':
         docker_run.extend(['-e', f"FIGSHARE_TOKEN={env['FIGSHARE_TOKEN']}", 'upload'])
-    if name == "validate":
+    if name == "Map_Drugs" or name == "Map_Samples":
         docker_run.extend(['upload'])
     if 'GITHUB_TOKEN' in env and name == "GitHub":
         docker_run.extend(['-e', f"GITHUB_TOKEN={env['GITHUB_TOKEN']}", 'upload'])

@@ -300,6 +303,18 @@ def compress_file(file_path):
         with gzip.open(compressed_file_path, 'wb') as f_out:
             shutil.copyfileobj(f_in, f_out)
     os.remove(file_path)
+
+def get_latest_commit_hash(owner, repo, branch='main'):
+    """
+    Returns the SHA of the latest commit on the specified branch.
+    """
+    url = f"https://api.github.com/repos/{owner}/{repo}/commits/{branch}"
+    response = requests.get(url)
+    response.raise_for_status()
+
+    # The commit data is in JSON format; the 'sha' field is the full commit hash.
+    commit_data = response.json()
+    return commit_data['sha']
 
 ######
 ### Pre-Build Environment Token Check

@@ -388,17 +403,17 @@ def compress_file(file_path):
 ######
 ### Begin Upload and/or validation
 #####
-
-if args.figshare or args.validate:
+if args.figshare or args.validate or github_token:
+# if args.figshare or args.validate:
     # FigShare File Prefixes:
+
     prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
     broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
     if "broad_sanger" in datasets:
         prefixes.extend(broad_sanger_datasets)
         datasets.extend(broad_sanger_datasets)
         datasets.remove("broad_sanger")
 
-
     figshare_token = os.getenv('FIGSHARE_TOKEN')
 
     all_files_dir = 'local/all_files_dir'

@@ -422,6 +437,13 @@ def compress_file(file_path):
     for file in glob(os.path.join(all_files_dir, '*.gz')):
         decompress_file(file)
 
+    ### These should be done before schema checking.
+    sample_mapping_command = ['python3', 'scripts/map_improve_sample_ids.py', '--local_dir', "/tmp", '--version', args.version]
+    run_docker_upload_cmd(sample_mapping_command, 'all_files_dir', 'Map_Samples', args.version)
+
+    drug_mapping_command = ['python3', 'scripts/map_improve_drug_ids.py', '--local_dir', "/tmp", '--version', args.version]
+    run_docker_upload_cmd(drug_mapping_command, 'all_files_dir', 'Map_Drugs', args.version)
+
     # Run schema checker - This will always run if uploading data.
     schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
     run_docker_upload_cmd(schema_check_command, 'all_files_dir', 'validate', args.version)

@@ -438,28 +460,47 @@ def compress_file(file_path):
 
     print("File compression and decompression adjustments are complete.")
 
-    # Upload to Figshare using Docker
+    ### Upload to Figshare using Docker
     if args.figshare and args.version and figshare_token:
-        figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--publish']
+        figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--version', args.version, '--publish']
         run_docker_upload_cmd(figshare_command, 'all_files_dir', 'Figshare', args.version)
 
+    ### Push changes to GitHub using Docker
+    # if args.version and args.figshare and figshare_token and github_token and args.github_username and args.github_email:
 
-    # Push changes to GitHub using Docker
-    if args.version and args.figshare and figshare_token and github_token and args.github_username and args.github_email:
-        git_command = [
-            'bash', '-c', (
-                f'git config --global user.name "{args.github_username}" '
-                f'&& git config --global user.email "{args.github_email}" '
-                f'&& cp /tmp/figshare_latest.yml /usr/src/app/coderdata/docs/_data/figshare_latest.yml '
-                f'&& git add docs/_data/figshare_latest.yml '
-                f'&& git commit -m "Data Built and Uploaded. New Tag: {args.version}" '
-                f'&& git tag {args.version} '
-                f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git main '
-                f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git --tags'
-            )
-        ]
-        run_docker_upload_cmd(git_command, 'all_files_dir', 'GitHub', args.version)
+    # You can only upload to Github after Figshare upload is completed - otherwise figshare_latest.yml and dataset.yml won't be available.
+    if args.version and github_token and args.github_username and args.github_email:
+
+        git_command = [
+            'bash', '-c', (
+                f'git config --global user.name "{args.github_username}" '
+                f'&& git config --global user.email "{args.github_email}" '
+
+                # Checkout a new branch
+                f'&& git checkout -b testing-auto-build-pr-{args.version} '
+
+                # Copy and add the necessary files
+                f'&& cp /tmp/improve_sample_mapping.json.gz /usr/src/app/coderdata/build/improve_sample_mapping.json.gz '
+                f'&& cp /tmp/improve_drug_mapping.json.gz /usr/src/app/coderdata/build/improve_drug_mapping.json.gz '
+                f'&& gunzip /usr/src/app/coderdata/build/*.gz '
+                f'&& git add -f build/improve_sample_mapping.json build/improve_drug_mapping.json '
+                f'&& cp /tmp/figshare_latest.yml /usr/src/app/coderdata/docs/_data/figshare_latest.yml '
+                f'&& cp /tmp/dataset.yml /usr/src/app/coderdata/coderdata/dataset.yml '
+                f'&& git add -f docs/_data/figshare_latest.yml coderdata/dataset.yml'

+                # Tag and push
+                f'&& git commit -m "Data Built and Uploaded. New Tag: {args.version}" '
+                f'&& git tag {args.version} '
+                f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git testing-auto-build-pr-{args.version} '
+
+                # Create a PR using GitHub CLI
+                f'&& gh pr create --title "Testing Auto PR instead of auto Merge {args.version}" '
+                f'--body "This PR was automatically generated by the build process." '
+                f'--base main --head testing-auto-build-pr-{args.version}'
+            )
+        ]
+
+        run_docker_upload_cmd(git_command, 'all_files_dir', 'GitHub', args.version)
 
-
 if __name__ == '__main__':
     main()

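The new get_latest_commit_hash helper calls the GitHub REST API unauthenticated, which is rate-limited to roughly 60 requests per hour per IP. A usage sketch — the optional token argument here is an assumption, not part of this commit:

    import os
    import requests

    def get_latest_commit_hash(owner, repo, branch='main', token=None):
        # Same endpoint as the helper added above; the Authorization header
        # (hypothetical here) lifts the unauthenticated rate limit.
        url = f"https://api.github.com/repos/{owner}/{repo}/commits/{branch}"
        headers = {'Authorization': f'Bearer {token}'} if token else {}
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.json()['sha']

    sha = get_latest_commit_hash('PNNL-CompBio', 'coderdata',
                                 token=os.getenv('GITHUB_TOKEN'))
    print(sha[:7])  # short form, e.g. '8fadc34'
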
build/build_dataset.py

Lines changed: 2 additions & 2 deletions

@@ -56,7 +56,7 @@ def process_docker(dataset,validate):
 
     datasets_to_build.extend(dataset_map.get(dataset, []))
 
-    compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
+    compose_command = ['docker','compose', '-f', compose_file, 'build'] + datasets_to_build
 
     log_file_path = 'local/docker.log'
     env = os.environ.copy()

@@ -260,7 +260,7 @@ def run_schema_checker(dataset):
         decompress_file(os.path.join('local', all_files_dir, file))
 
     # Run schema checker
-    schema_check_command = ['python3', 'check_schema.py', '--datasets'] + datasets
+    schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
     run_docker_validate_cmd(schema_check_command, all_files_dir, 'Validation')
 
 def main():
build/docker/Dockerfile.upload

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,20 @@ RUN python -m pip install --upgrade pip pyyaml requests linkml
66

77
RUN apt-get update && apt-get install -y git
88

9+
# Install necessary system packages: git, curl, and gpg
10+
RUN apt-get update && \
11+
apt-get install -y git curl gnupg && \
12+
rm -rf /var/lib/apt/lists/*
13+
14+
# Install GitHub CLI (gh)
15+
RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | \
16+
gpg --dearmor -o /usr/share/keyrings/githubcli-archive-keyring.gpg && \
17+
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | \
18+
tee /etc/apt/sources.list.d/github-cli.list > /dev/null && \
19+
apt-get update && \
20+
apt-get install -y gh && \
21+
rm -rf /var/lib/apt/lists/*
22+
923

1024
RUN git clone https://github.com/PNNL-CompBio/coderdata.git
11-
WORKDIR /usr/src/app/coderdata
25+
WORKDIR /usr/src/app/coderdata

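The gh CLI installed here backs the new gh pr create step in build_all.py; in a container it authenticates through the GH_TOKEN or GITHUB_TOKEN environment variable rather than an interactive login. A quick sanity check, assuming an image built from this Dockerfile with the token passed in via docker run -e:

    import subprocess

    # Fails loudly if gh is missing from the image or the token is absent/invalid.
    subprocess.run(['gh', '--version'], check=True)
    subprocess.run(['gh', 'auth', 'status'], check=True)
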
build/genes/00-buildGeneFile.R

Lines changed: 0 additions & 1 deletion

@@ -55,4 +55,3 @@ write.table(full.df,'/tmp/genes.csv',sep=',',row.names=F,quote=T)
 
 ##store this file somewhere!
 
-