From 371d20971a0253b3f957456ba9ea36cd74af881c Mon Sep 17 00:00:00 2001
From: Anton Kriese
Date: Fri, 3 Jun 2022 15:46:36 +0200
Subject: [PATCH] replace param min_samples with dynamic calculation

---
 workflow/scripts/read_cluster.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/workflow/scripts/read_cluster.py b/workflow/scripts/read_cluster.py
index 80711fd..0256066 100644
--- a/workflow/scripts/read_cluster.py
+++ b/workflow/scripts/read_cluster.py
@@ -213,7 +213,6 @@ def parse_args():
     cluster_parser = subparsers.add_parser('cluster', help="Cluster reads by distance.")
     cluster_parser.add_argument("sequences", type=Path)
     cluster_parser.add_argument("distances", type=Path)
-    cluster_parser.add_argument("--min_samples", type=int, default=5)
     cluster_parser.add_argument("--eps", type=float, default=0.3)
     cluster_parser.add_argument("--output", "-o", type=Path, default=None)
 
@@ -229,13 +228,15 @@ def parse_args():
 if __name__ == '__main__':
     args = parse_args()
     seq_coll = SequenceCollection.from_fasta(args.sequences)
+    n_reads = seq_coll.n_reads()
     n_excluded = len(seq_coll.get_labels())
     if args.command == "cluster":
         # Inputs are: distances, output, eps, min_samples
         # produces a file (output) that contains the cluster labels for all given sequences
         if args.distances is not None:
             matrix = read_dist_matrix(args.distances)
-            clustering = cluster_on_distances(matrix, args.eps, args.min_samples, n_excluded)
+            clustering = cluster_on_distances(matrix, args.eps,
+                                              min_samples=min(n_reads//3+1, 5), exclude_last_n=n_excluded)
             write_clustering(clustering, args.output)
         elif args.sequences is None:
             print("You have to either pass pairwise distances or sequences to be clustered.")