Skip to content

Commit dcac331

Browse files
add strobealgin to Dockerfile for further testing
1 parent 03a33c4 commit dcac331

File tree

5 files changed

+466
-2
lines changed

5 files changed

+466
-2
lines changed

Dockerfile

+11-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ RUN apt-get -y install \
3131
python3-dev \
3232
python3-pip \
3333
libjemalloc-dev \
34+
libisal-dev \
3435
cmake \
3536
make \
3637
g++ \
@@ -81,6 +82,15 @@ RUN wget https://github.com/samtools/samtools/releases/download/1.21/samtools-1.
8182
&& cd .. \
8283
&& rm -rf samtools-1.21
8384

85+
##install strobealign
86+
RUN git clone https://github.com/ksahlin/strobealign \
87+
&& cd strobealign \
88+
&& cmake -B build -DCMAKE_C_FLAGS="-msse4.2" -DCMAKE_CXX_FLAGS="-msse4.2" \
89+
&& cmake --build build -j 8 \
90+
&& cd ..
91+
92+
ENV PATH /opt/strobealign/build:$PATH
93+
8494
##install bwa-mem
8595
RUN git clone https://github.com/lh3/bwa.git \
8696
&& cd bwa \
@@ -187,4 +197,4 @@ RUN conda create -y -n renv -c conda-forge -c bioconda \
187197
bioconductor-rtracklayer=1.62.0 \
188198
r-randomcolor=1.1.0.1
189199
RUN echo "source activate renv" > ~/.bashrc
190-
ENV PATH /miniconda/envs/renv/bin:$PATH
200+
ENV PATH /miniconda/envs/renv/bin:$PATH

cosigt_smk/workflow/rules/odgi.smk

+50
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,56 @@ rule odgi_paths_matrix:
7878
cut -f 1,4- | gzip > {output}
7979
'''
8080

81+
rule odgi_view_len:
82+
'''
83+
https://github.com/pangenome/odgi
84+
'''
85+
input:
86+
rules.odgi_view.output
87+
output:
88+
config['output'] + '/odgi/view/{region}.len.tsv'
89+
threads:
90+
1
91+
resources:
92+
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
93+
time=lambda wildcards, attempt: attempt * config['default']['time']
94+
container:
95+
'docker://pangenome/odgi:1726671973'
96+
conda:
97+
'../envs/odgi.yaml'
98+
benchmark:
99+
'benchmarks/{region}.odgi_view_len.benchmark.txt'
100+
shell:
101+
'''
102+
grep '^S' {input} | \
103+
awk '{{print("node."$2,length($3))}}' OFS="\\t" > {output}
104+
'''
105+
106+
rule filter_odgi_matrix:
107+
'''
108+
https://github.com/davidebolo1993/cosigt
109+
'''
110+
input:
111+
coverage=rules.odgi_chop.output,
112+
size=rules.odgi_view_len.output
113+
output:
114+
config['output'] + '/odgi/paths/matrix_flt/{region}.tsv.gz'
115+
threads:
116+
1
117+
resources:
118+
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
119+
time=lambda wildcards, attempt: attempt * config['default']['time']
120+
container:
121+
'docker://davidebolo1993/cosigt_workflow:latest'
122+
#conda:
123+
#'../envs/odgi.yaml'
124+
benchmark:
125+
'benchmarks/{region}.filter_odgi_matrix.benchmark.txt'
126+
shell:
127+
'''
128+
flt {input.coverage} {input.size} | gzip > {output}
129+
'''
130+
81131
rule odgi_similarity:
82132
'''
83133
https://github.com/pangenome/odgi

cosigt_smk/workflow/scripts/cluster.r

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ regularMatrix[is.na(regularMatrix)]<-1
4141
distanceMatrix <- as.dist(regularMatrix)
4242

4343
# Calculate silhouette score and best partition
44-
max_cluster <- round(length(unique(df$group.a)) / 3) ##control
44+
max_cluster <- round(length(unique(df$group.a)) / 5) ##control
4545
res <- NbClust(diss = distanceMatrix, method = "average", index = "silhouette",
4646
distance = NULL, max.nc = max_cluster)$Best.partition
4747

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
library(data.table)
2+
library(dbscan)
3+
library(rjson)
4+
library(reshape2)
5+
library(reshape2)
6+
library(NbClust)
7+
8+
input_file<-"filt.tsv.gz"
9+
df<-fread(input_file)
10+
11+
for (d in c("euclidean.dist","jaccard.dist","cosine.dissim","manhattan.dist")) {
12+
13+
regularMatrix <- acast(df, group.a ~ group.b, value.var = d)
14+
distanceMatrix<-as.dist(regularMatrix)
15+
pdf(paste0("knn.",d,".pdf"))
16+
kNNdistplot(distanceMatrix,k=2)
17+
dev.off()
18+
kNN_distances <- kNNdist(distanceMatrix, k = 2)
19+
sorted_kNN <- sort(kNN_distances)
20+
first_derivative <- diff(sorted_kNN)
21+
# Step 2: Compute the second derivative
22+
second_derivative <- diff(first_derivative)
23+
# Step 3: Identify the index with the maximum second derivative
24+
optimal_index <- which.max(second_derivative)
25+
# Step 4: Retrieve the corresponding `eps` value
26+
optimal_eps <- sorted_kNN[optimal_index + 1] # +1 d
27+
db<-dbscan(distanceMatrix,minPts=3, eps=4.3)
28+
cl<-db$cluster
29+
names(cl)<-labels(distanceMatrix)
30+
res.list <- lapply(split(cl, names(cl)), unname)
31+
named_res <- lapply(cl, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup")
32+
jout <- toJSON(named_res)
33+
# Write JSON output
34+
output_file<-paste0("dbscan.",d,".json")
35+
write(jout, output_file)
36+
37+
38+
max_cluster <- round(length(unique(df$group.a)) / 5) ##control
39+
res <- NbClust(diss = distanceMatrix, method = "average", index = "silhouette",
40+
distance = NULL, max.nc = max_cluster)$Best.partition
41+
42+
# Format results
43+
res.list <- lapply(split(res, names(res)), unname)
44+
named_res <- lapply(res.list, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup")
45+
jout <- toJSON(named_res)
46+
47+
# Write JSON output
48+
output_file<-paste0("agglomerative.",d,".json")
49+
write(jout, output_file)
50+
51+
}

0 commit comments

Comments
 (0)