From eb469dcf46a09ab9dc20cf2702e6eb40bbe20078 Mon Sep 17 00:00:00 2001 From: biolxy Date: Fri, 8 Mar 2024 16:58:03 +0800 Subject: [PATCH 1/3] Add a new option (umi_skipb) to skip a specified number of bases before trimming the UMI sequence. --- src/main.cpp | 4 +++- src/options.h | 2 ++ src/umiprocessor.cpp | 18 +++++++++--------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index e93f9d90..5978a4f7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -127,7 +127,8 @@ int main(int argc, char* argv[]){ cmd.add("umi_loc", 0, "specify the location of UMI, can be (index1/index2/read1/read2/per_index/per_read, default is none", false, ""); cmd.add("umi_len", 0, "if the UMI is in read1/read2, its length should be provided", false, 0); cmd.add("umi_prefix", 0, "if specified, an underline will be used to connect prefix and UMI (i.e. prefix=UMI, UMI=AATTCG, final=UMI_AATTCG). No prefix by default", false, ""); - cmd.add("umi_skip", 0, "if the UMI is in read1/read2, fastp can skip several bases following UMI, default is 0", false, 0); + cmd.add("umi_skip", 0, "if the UMI is in read1/read2, fastp can skip several bases following UMI, default is 0", false, 0, cmdline::range(1, 100)); + cmd.add("umi_skipb", 0, "if the UMI is in read1/read2, fastp can skip several bases before trim UMI, default is 0", false, 0, cmdline::range(1, 100)); cmd.add("umi_delim", 0, "delimiter to use between the read name and the UMI, default is :", false, ":"); // overrepresented sequence analysis @@ -384,6 +385,7 @@ int main(int argc, char* argv[]){ opt.umi.length = cmd.get("umi_len"); opt.umi.prefix = cmd.get("umi_prefix"); opt.umi.skip = cmd.get("umi_skip"); + opt.umi.skipb = cmd.get("umi_skipb"); opt.umi.delimiter = cmd.get("umi_delim"); if(opt.umi.enabled) { string umiLoc = cmd.get("umi_loc"); diff --git a/src/options.h b/src/options.h index 57086b25..6ae23481 100644 --- a/src/options.h +++ b/src/options.h @@ -108,6 +108,7 @@ class UMIOptions { 
location = UMI_LOC_NONE; length = 0; skip = 0; + skipb = 0; delimiter= ":"; } public: @@ -115,6 +116,7 @@ class UMIOptions { int location; int length; int skip; + int skipb; string prefix; string separator; string delimiter; diff --git a/src/umiprocessor.cpp b/src/umiprocessor.cpp index 4c2c8295..a5b557df 100644 --- a/src/umiprocessor.cpp +++ b/src/umiprocessor.cpp @@ -18,12 +18,12 @@ void UmiProcessor::process(Read* r1, Read* r2) { else if(mOptions->umi.location == UMI_LOC_INDEX2 && r2) umi = r2->lastIndex(); else if(mOptions->umi.location == UMI_LOC_READ1){ - umi = r1->mSeq->substr(0, min(r1->length(), mOptions->umi.length)); - r1->trimFront(umi.length() + mOptions->umi.skip); + umi = r1->mSeq->substr(mOptions->umi.skipb, min(r1->length(), mOptions->umi.length)); + r1->trimFront(mOptions->umi.skipb + umi.length() + mOptions->umi.skip); } else if(mOptions->umi.location == UMI_LOC_READ2 && r2){ - umi = r2->mSeq->substr(0, min(r2->length(), mOptions->umi.length)); - r2->trimFront(umi.length() + mOptions->umi.skip); + umi = r2->mSeq->substr(mOptions->umi.skipb, min(r2->length(), mOptions->umi.length)); + r2->trimFront(mOptions->umi.skipb + umi.length() + mOptions->umi.skip); } else if(mOptions->umi.location == UMI_LOC_PER_INDEX){ string umiMerged = r1->firstIndex(); @@ -37,13 +37,13 @@ void UmiProcessor::process(Read* r1, Read* r2) { } } else if(mOptions->umi.location == UMI_LOC_PER_READ){ - string umi1 = r1->mSeq->substr(0, min(r1->length(), mOptions->umi.length)); + string umi1 = r1->mSeq->substr(mOptions->umi.skipb, min(r1->length(), mOptions->umi.length)); string umiMerged = umi1; - r1->trimFront(umi1.length() + mOptions->umi.skip); + r1->trimFront(mOptions->umi.skipb + umi1.length() + mOptions->umi.skip); if(r2){ - string umi2 = r2->mSeq->substr(0, min(r2->length(), mOptions->umi.length)); - umiMerged = umiMerged + "_" + umi2; - r2->trimFront(umi2.length() + mOptions->umi.skip); + string umi2 = r2->mSeq->substr(mOptions->umi.skipb, min(r2->length(), 
mOptions->umi.length)); + umiMerged = umiMerged + "+" + umi2; + r2->trimFront(mOptions->umi.skipb + umi2.length() + mOptions->umi.skip); } addUmiToName(r1, umiMerged); From 4e03707ee515dddf1ad7e2cb9983ac358c59b804 Mon Sep 17 00:00:00 2001 From: biolxy Date: Fri, 8 Mar 2024 17:29:08 +0800 Subject: [PATCH 2/3] Update umiprocessor.cpp --- src/umiprocessor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/umiprocessor.cpp b/src/umiprocessor.cpp index a5b557df..8b8b5483 100644 --- a/src/umiprocessor.cpp +++ b/src/umiprocessor.cpp @@ -42,7 +42,7 @@ void UmiProcessor::process(Read* r1, Read* r2) { r1->trimFront(mOptions->umi.skipb + umi1.length() + mOptions->umi.skip); if(r2){ string umi2 = r2->mSeq->substr(mOptions->umi.skipb, min(r2->length(), mOptions->umi.length)); - umiMerged = umiMerged + "+" + umi2; + umiMerged = umiMerged + "_" + umi2; r2->trimFront(mOptions->umi.skipb + umi2.length() + mOptions->umi.skip); } From 729ec931b3687ceac7331acf95fc16897f5595f8 Mon Sep 17 00:00:00 2001 From: biolxy Date: Tue, 26 Mar 2024 12:02:48 +0800 Subject: [PATCH 3/3] Update main.cpp Change the value range to 0 to 100 --- src/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 5978a4f7..be80c4e5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -127,8 +127,8 @@ int main(int argc, char* argv[]){ cmd.add("umi_loc", 0, "specify the location of UMI, can be (index1/index2/read1/read2/per_index/per_read, default is none", false, ""); cmd.add("umi_len", 0, "if the UMI is in read1/read2, its length should be provided", false, 0); cmd.add("umi_prefix", 0, "if specified, an underline will be used to connect prefix and UMI (i.e. prefix=UMI, UMI=AATTCG, final=UMI_AATTCG). 
No prefix by default", false, ""); - cmd.add("umi_skip", 0, "if the UMI is in read1/read2, fastp can skip several bases following UMI, default is 0", false, 0, cmdline::range(1, 100)); - cmd.add("umi_skipb", 0, "if the UMI is in read1/read2, fastp can skip several bases before trim UMI, default is 0", false, 0, cmdline::range(1, 100)); + cmd.add("umi_skip", 0, "if the UMI is in read1/read2, fastp can skip several bases following UMI, default is 0", false, 0, cmdline::range(0, 100)); + cmd.add("umi_skipb", 0, "if the UMI is in read1/read2, fastp can skip several bases before trim UMI, default is 0", false, 0, cmdline::range(0, 100)); cmd.add("umi_delim", 0, "delimiter to use between the read name and the UMI, default is :", false, ":"); // overrepresented sequence analysis