Skip to content

Commit be3ccc7

Browse files
committed
reduced multiprocessing load, and updated MPNST
1 parent 5b248dd commit be3ccc7

File tree

3 files changed

+40
-22
lines changed

3 files changed

+40
-22
lines changed

build/broad_sanger/04-drug_dosage_and_curves.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
samplefile = opts.samplefile
2525
drugfile = opts.dfile
2626

27-
cmd = 'python 04b-nci60-updated.py --sampleFile '+samplefile+' --drugFile '+drugfile
27+
cmd = '/opt/venv/bin/python 04b-nci60-updated.py --sampleFile '+samplefile+' --drugFile '+drugfile
2828
print(cmd)
2929
os.system(cmd)
3030

build/mpnst/01_mpnst_get_omics.R

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -149,26 +149,44 @@ cnv<-do.call(rbind,lapply(setdiff(combined$CopyNumber,NA),function(x){
149149
sample<-subset(combined,CopyNumber==x)
150150
print(sample$improve_sample_id)
151151
res<-fread(synGet(x2)$path)
152-
long_df <- res[, strsplit(as.character(gene), ","), by = .(chromosome, start, end, depth, log2)]
153-
filtered_df <- long_df |>
154-
subset(is.finite(log2))|>
155-
filter(V1 %in% genes_df$gene) # get only protein coding genes and remove empty gene symbols
156-
filtered_df <- filtered_df[, .(gene_symbol = V1,
157-
improve_sample_id = sample$improve_sample_id[1],
158-
copy_number = 2^log2,
159-
source = "NF Data Portal",
160-
study = "MPNST PDX MT")]
161-
res<-filtered_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp
162-
dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del',
163-
ifelse(copy_number<0.7311832,'het loss',
164-
ifelse(copy_number<1.214125,'diploid',
165-
ifelse(copy_number<1.422233,'gain','amp')))))|>
166-
left_join(genes_df)|>
167-
dplyr::select(entrez_id,improve_sample_id,copy_number,copy_call,study,source)|>
168-
subset(!is.na(entrez_id))|>
169-
distinct()
170-
res|>group_by(copy_call)|>summarize(n_distinct(entrez_id))
171-
return(distinct(res))
152+
153+
long_df<- res|>
154+
tidyr::separate_rows(gene,sep=',')|>
155+
dplyr::rename(gene_symbol='gene')|>
156+
dplyr::left_join(genes_df)|>
157+
subset(!is.na(entrez_id))|>
158+
dplyr::select(entrez_id,log2)|>
159+
dplyr::distinct()|>
160+
dplyr::mutate(copy_number=2^log2)
161+
162+
res<-long_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp
163+
dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del',
164+
ifelse(copy_number<0.7311832,'het loss',
165+
ifelse(copy_number<1.214125,'diploid',
166+
ifelse(copy_number<1.422233,'gain','amp')))))|>
167+
mutate(study='MPNST PDX MT',source='NF Data Portal',improve_sample_id=sample$improve_sample_id[1])|>
168+
dplyr::distinct()
169+
170+
# long_df <- res[, strsplit(as.character(gene), ","), by = .(chromosome, start, end, depth, log2)]
171+
# filtered_df <- long_df |>
172+
# subset(is.finite(log2))|>
173+
# filter(V1 %in% genes_df$gene) # get only protein coding genes and remove empty gene symbols
174+
# filtered_df <- filtered_df[, .(gene_symbol = V1,
175+
# improve_sample_id = sample$improve_sample_id[1],
176+
# copy_number = 2^log2,
177+
# source = "NF Data Portal",
178+
# study = "MPNST PDX MT")]
179+
# res<-filtered_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp
180+
# dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del',
181+
# ifelse(copy_number<0.7311832,'het loss',
182+
# ifelse(copy_number<1.214125,'diploid',
183+
# ifelse(copy_number<1.422233,'gain','amp')))))|>
184+
# left_join(genes_df)|>
185+
# dplyr::select(entrez_id,improve_sample_id,copy_number,copy_call,study,source)|>
186+
# subset(!is.na(entrez_id))|>
187+
# distinct()
188+
# res|>group_by(copy_call)|>summarize(n_distinct(entrez_id))
189+
return(res)
172190
# }
173191
}))
174192

build/utils/fit_curve.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
163163
count = count or (4484081 - start)
164164
groups = islice(groups, start, start+count)
165165

166-
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
166+
with multiprocessing.Pool(processes=multiprocessing.cpu_count()/4) as pool:
167167
results = pool.map(process_single_drug, groups)
168168

169169
with open(f'{fname}.{start}', 'w') as f:

0 commit comments

Comments
 (0)