add strobealign to Dockerfile for further testing

davidebolo1993 · davidebolo1993 · commit dcac3317f27d · 2024-12-19T11:13:11.000+01:00
diff --git a/Dockerfile b/Dockerfile
@@ -31,6 +31,7 @@ RUN apt-get -y install \
 	python3-dev \
 	python3-pip \ 
 	libjemalloc-dev \
+	libisal-dev \
 	cmake \
 	make \
 	g++ \
@@ -81,6 +82,15 @@ RUN wget https://github.com/samtools/samtools/releases/download/1.21/samtools-1.
 	&& cd .. \
 	&& rm -rf samtools-1.21
 
+##install strobealign: clone upstream, configure with cmake (-msse4.2 for C and C++), build with 8 jobs
+RUN git clone https://github.com/ksahlin/strobealign \
+	&& cd strobealign \
+	&& cmake -B build -DCMAKE_C_FLAGS="-msse4.2" -DCMAKE_CXX_FLAGS="-msse4.2" \
+	&& cmake --build build -j 8 \
+	&& cd ..
+## NOTE(review): the PATH entry below assumes the clone ran with /opt as working directory - confirm
+ENV PATH /opt/strobealign/build:$PATH
+
 ##install bwa-mem
 RUN git clone https://github.com/lh3/bwa.git \
 	&& cd bwa \
@@ -187,4 +197,4 @@ RUN conda create -y -n renv -c conda-forge -c bioconda \
 	bioconductor-rtracklayer=1.62.0 \
 	r-randomcolor=1.1.0.1
 RUN echo "source activate renv" > ~/.bashrc
-ENV PATH /miniconda/envs/renv/bin:$PATH
+ENV PATH /miniconda/envs/renv/bin:$PATH
diff --git a/cosigt_smk/workflow/rules/odgi.smk b/cosigt_smk/workflow/rules/odgi.smk
@@ -78,6 +78,56 @@ rule odgi_paths_matrix:
 		cut -f 1,4- | gzip > {output}
 		'''
 
+rule odgi_view_len:
+	'''
+	Write "node.<field 2>" and length(<field 3>), tab-separated, for each line of the odgi_view output starting with S (https://github.com/pangenome/odgi)
+	'''
+	input:
+		rules.odgi_view.output
+	output:
+		config['output'] + '/odgi/view/{region}.len.tsv'
+	threads:
+		1
+	resources:
+		mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
+		time=lambda wildcards, attempt: attempt * config['default']['time']
+	container:
+		'docker://pangenome/odgi:1726671973'
+	conda:
+		'../envs/odgi.yaml'
+	benchmark:
+		'benchmarks/{region}.odgi_view_len.benchmark.txt'
+	shell:
+		'''
+		grep '^S' {input} | \
+		awk '{{print("node."$2,length($3))}}' OFS="\\t" > {output}
+		'''
+
+rule filter_odgi_matrix:
+	'''
+	Run flt on the odgi_chop output and the odgi_view_len table, then gzip its stdout (https://github.com/davidebolo1993/cosigt)
+	'''
+	input:
+		coverage=rules.odgi_chop.output,
+		size=rules.odgi_view_len.output
+	output:
+		config['output'] + '/odgi/paths/matrix_flt/{region}.tsv.gz'
+	threads:
+		1
+	resources:
+		mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
+		time=lambda wildcards, attempt: attempt * config['default']['time']
+	container:
+		'docker://davidebolo1993/cosigt_workflow:latest'
+	#conda:
+		#'../envs/odgi.yaml'
+	benchmark:
+		'benchmarks/{region}.filter_odgi_matrix.benchmark.txt'
+	shell:
+		'''
+		flt {input.coverage} {input.size} | gzip > {output}
+		'''
+
 rule odgi_similarity:
 	'''
 	https://github.com/pangenome/odgi
diff --git a/cosigt_smk/workflow/scripts/cluster.r b/cosigt_smk/workflow/scripts/cluster.r
@@ -41,7 +41,7 @@ regularMatrix[is.na(regularMatrix)]<-1
 distanceMatrix <- as.dist(regularMatrix)
 
 # Calculate silhouette score and best partition
-max_cluster <- round(length(unique(df$group.a)) / 3) ##control
+max_cluster <- round(length(unique(df$group.a)) / 5) ##control
 res <- NbClust(diss = distanceMatrix, method = "average", index = "silhouette", 
                distance = NULL, max.nc = max_cluster)$Best.partition
 
diff --git a/cosigt_smk/workflow/scripts/cluster2.r b/cosigt_smk/workflow/scripts/cluster2.r
@@ -0,0 +1,51 @@
+library(data.table)
+library(dbscan)
+library(rjson)
+library(reshape2)
+library(reshape2)
+library(NbClust)
+
+input_file <- "filt.tsv.gz"
+df <- fread(input_file)
+
+for (d in c("euclidean.dist","jaccard.dist","cosine.dissim","manhattan.dist")) {
+    # Per metric column d: DBSCAN with eps taken from the k-NN elbow, then NbClust.
+    regularMatrix <- acast(df, group.a ~ group.b, value.var = d)
+    distanceMatrix <- as.dist(regularMatrix)
+    pdf(paste0("knn.", d, ".pdf"))
+    kNNdistplot(distanceMatrix, k = 2)
+    dev.off()
+    kNN_distances <- kNNdist(distanceMatrix, k = 2)
+    sorted_kNN <- sort(kNN_distances)
+    # Step 1: Compute the first derivative of the sorted k-NN distances
+    first_derivative <- diff(sorted_kNN)
+    # Step 2: Compute the second derivative
+    second_derivative <- diff(first_derivative)
+    # Step 3: Identify the index with the maximum second derivative
+    optimal_index <- which.max(second_derivative)
+    # Step 4: Retrieve the corresponding `eps` value (+1 re-centres the second difference)
+    optimal_eps <- sorted_kNN[optimal_index + 1]
+    # Use the per-metric elbow: one fixed eps cannot fit metrics on different scales
+    db <- dbscan(distanceMatrix, minPts = 3, eps = optimal_eps)
+    cl <- db$cluster
+    names(cl) <- labels(distanceMatrix)
+    named_res <- lapply(cl, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup")
+    jout <- toJSON(named_res)
+    # Write JSON output
+    output_file <- paste0("dbscan.", d, ".json")
+    write(jout, output_file)
+
+    # Silhouette-selected partition; max.nc is a fifth of the distinct group.a values
+    max_cluster <- round(length(unique(df$group.a)) / 5) ##control
+    res <- NbClust(diss = distanceMatrix, method = "average", index = "silhouette", 
+                distance = NULL, max.nc = max_cluster)$Best.partition
+
+    # Format results
+    res.list <- lapply(split(res, names(res)), unname)
+    named_res <- lapply(res.list, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup")
+    jout <- toJSON(named_res)
+
+    # Write JSON output
+    output_file <- paste0("agglomerative.", d, ".json")
+    write(jout, output_file)
+}
diff --git a/cosigt_smk/workflow/scripts/filt.go b/cosigt_smk/workflow/scripts/filt.go