[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: Help needed: Bug#962292: ITP: flye -- de novo assembler for single molecule sequencing reads using repeat graphs



Hi Steffen,

On Sat, Jun 06, 2020 at 03:41:44PM +0200, Steffen Möller wrote:
> > $ /usr/bin/minimap2 out_pacbio/10-consensus/chunks.fasta E.coli_PacBio_40x.fasta -x map-pb -t 4 -a -p 0.5 -N 10 --sam-hit-only -L -Q --secondary-seq
> > [ERROR] unknown option in "E.coli_PacBio_40x.fasta"
> > Exit code:   1
> 
> A look at -h and the .1 suggests that the order of arguments is
> unfortunate. Options should be first. Please kindly try
> 
> /usr/bin/minimap2 -x map-pb -t 4 -a -p 0.5 -N 10 --sam-hit-only -L -Q out_pacbio/10-consensus/chunks.fasta E.coli_PacBio_40x.fasta
> 
> and then add the "--secondary-seq" argument and see it fail.
> 
> At https://github.com/fenderglass/Flye/issues/249 I get the impression that the --secondary-seq is not an error and at
> https://github.com/fenderglass/Flye/blob/b0107d8a4806a42aa8a16667bb1abb93ab9afd56/lib/minimap2/main.c#L69
> I see it implemented which I do not see in the original
> https://github.com/lh3/minimap2/blob/master/main.c
> 
> I do not have an immediate answer to what to do now. Upstream calls their minimap "flye-minimap". So, maybe embedding that binary with the flye binary may indeed be an option.

The suspicion that the minimap2 code copy in flye is patched.  I have
attached the diff between the Debian packaged minimap2 which actually
introduces --secondary-seq.

I wonder if there is some volunteer to talk to both upstreams to
merge the patch.

Kind regards

       Andreas. 

-- 
http://fam-tille.de
diff -ubrN minimap2/format.c flye-minimap2/format.c
--- minimap2/format.c	2019-08-01 15:06:53.638518907 +0200
+++ flye-minimap2/format.c	2020-04-24 22:34:42.000000000 +0200
@@ -367,14 +367,18 @@
 		clip_len[0] = r->rev? qlen - r->qe : r->qs;
 		clip_len[1] = r->rev? r->qs : qlen - r->qe;
 		if (in_tag) {
-			int clip_char = (sam_flag&0x800) && !(opt_flag&MM_F_SOFTCLIP)? 5 : 4;
+			//int clip_char = (sam_flag&0x800) && !(opt_flag&MM_F_SOFTCLIP)? 5 : 4;
+			int clip_char = ((sam_flag&0x800 || (sam_flag&0x100 && opt_flag&MM_F_SECONDARY_SEQ)) &&
+							 !(opt_flag&MM_F_SOFTCLIP)) ? 5 : 4;
 			mm_sprintf_lite(s, "\tCG:B:I");
 			if (clip_len[0]) mm_sprintf_lite(s, ",%u", clip_len[0]<<4|clip_char);
 			for (k = 0; k < r->p->n_cigar; ++k)
 				mm_sprintf_lite(s, ",%u", r->p->cigar[k]);
 			if (clip_len[1]) mm_sprintf_lite(s, ",%u", clip_len[1]<<4|clip_char);
 		} else {
-			int clip_char = (sam_flag&0x800) && !(opt_flag&MM_F_SOFTCLIP)? 'H' : 'S';
+			//int clip_char = (sam_flag&0x800) && !(opt_flag&MM_F_SOFTCLIP)? 'H' : 'S';
+			int clip_char = ((sam_flag&0x800 || (sam_flag&0x100 && opt_flag&MM_F_SECONDARY_SEQ)) &&
+							 !(opt_flag&MM_F_SOFTCLIP)) ? 'H' : 'S';
 			assert(clip_len[0] < qlen && clip_len[1] < qlen);
 			if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char);
 			for (k = 0; k < r->p->n_cigar; ++k)
@@ -449,7 +453,7 @@
 		if (cigar_in_tag) {
 			int slen;
 			if ((flag & 0x900) == 0 || (opt_flag & MM_F_SOFTCLIP)) slen = t->l_seq;
-			else if (flag & 0x100) slen = 0;
+			else if ((flag & 0x100) && !(opt_flag & MM_F_SECONDARY_SEQ)) slen = 0;
 			else slen = r->qe - r->qs;
 			mm_sprintf_lite(s, "%dS%dN", slen, r->re - r->rs);
 		} else write_sam_cigar(s, flag, 0, t->l_seq, r, opt_flag);
@@ -490,7 +494,7 @@
 			mm_sprintf_lite(s, "\t");
 			if (t->qual) sam_write_sq(s, t->qual, t->l_seq, r->rev, 0);
 			else mm_sprintf_lite(s, "*");
-		} else if (flag & 0x100) {
+		} else if ((flag & 0x100) && !(opt_flag & MM_F_SECONDARY_SEQ)){
 			mm_sprintf_lite(s, "*\t*");
 		} else {
 			sam_write_sq(s, t->seq + r->qs, r->qe - r->qs, r->rev, r->rev);
diff -ubrN minimap2/main.c flye-minimap2/main.c
--- minimap2/main.c	2019-08-01 15:06:53.638518907 +0200
+++ flye-minimap2/main.c	2020-04-24 22:34:42.000000000 +0200
@@ -66,6 +66,7 @@
 	{ "junc-bed",       ko_required_argument, 340 },
 	{ "junc-bonus",     ko_required_argument, 341 },
 	{ "sam-hit-only",   ko_no_argument,       342 },
+	{ "secondary-seq",  ko_no_argument,       343 },
 	{ "help",           ko_no_argument,       'h' },
 	{ "max-intron-len", ko_required_argument, 'G' },
 	{ "version",        ko_no_argument,       'V' },
@@ -209,6 +210,7 @@
 		else if (c == 338) opt.max_qlen = mm_parse_num(o.arg); // --max-qlen
 		else if (c == 340) junc_bed = o.arg; // --junc-bed
 		else if (c == 342) opt.flag |= MM_F_SAM_HIT_ONLY; // --sam-hit-only
+		else if (c == 343) opt.flag |= MM_F_SECONDARY_SEQ; // --secondary-seq
 		else if (c == 314) { // --frag
 			yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
 		} else if (c == 315) { // --secondary
diff -ubrN minimap2/minimap.h flye-minimap2/minimap.h
--- minimap2/minimap.h	2019-08-01 15:06:53.642518911 +0200
+++ flye-minimap2/minimap.h	2020-04-24 22:34:42.000000000 +0200
@@ -36,6 +36,7 @@
 #define MM_F_NO_END_FLT    0x10000000
 #define MM_F_HARD_MLEVEL   0x20000000
 #define MM_F_SAM_HIT_ONLY  0x40000000
+#define MM_F_SECONDARY_SEQ 0x80000000	//output SEQ field for seqondary alignments using hard clipping
 
 #define MM_I_HPC          0x1
 #define MM_I_NO_SEQ       0x2

Reply to: