Skip to content

Commit 2cb7b0e

Browse files
authored
Merge pull request #389 from AdamaJava/qbamfilter_speedup
perf(qbamfilter): minor speedups
2 parents d2d4aff + d5a1a0c commit 2cb7b0e

File tree

9 files changed

+224
-137
lines changed

9 files changed

+224
-137
lines changed

qbamfilter/src/org/qcmg/qbamfilter/filter/CigarFilter.java

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ public class CigarFilter implements SamRecordFilter{
1717
private final CigarOperator operator;
1818
private final int value;
1919
private final Comparator op;
20-
20+
private final boolean shortCutEnabled;
2121
/**
22-
* initilize cigar operator name, comparator and operator value
22+
* initialize cigar operator name, comparator and operator value
2323
*
2424
* @param operatorName At the moment the valid names are [M,I,D,N,S,H,P].
2525
* @param comp refer to valid comparator of org.qcmg.qbamfilter.filter.Comparator.
@@ -38,33 +38,35 @@ public CigarFilter(String operatorName, Comparator comp, String value )throws Ex
3838
else if(operatorName.equalsIgnoreCase("H")){operator = CigarOperator.H;}
3939
else if(operatorName.equalsIgnoreCase("P")){operator = CigarOperator.P;}
4040
else{
41-
throw new Exception("invaid Cigar String operator: " + operatorName + "in query condition Cigar_" + operatorName );
42-
}
41+
throw new Exception("invalid Cigar String operator: " + operatorName + "in query condition Cigar_" + operatorName );
42+
}
43+
shortCutEnabled = op == Comparator.GreatEqual || op == Comparator.Great;
4344
}
4445

4546
/**
4647
* check the record base length with required CigarOperator.
4748
* @param record: a SAMRecord
48-
* @return true if the length is satified by the condition
49+
* @return true if the length is satisfied by the condition
4950
* Usage example: if you want filter out all reads with matched base greater equal than 35mers.
5051
* SamRecordFilter myfilter = new CigarFilter("M", Comparator.GreatEqual, "35" );
5152
* if(myfilter.filterout(record) == true){ System.out.println(record.toString);}
5253
*/
5354
@Override
5455
public boolean filterOut(final SAMRecord record){
5556
Cigar cigar = record.getCigar();
56-
57-
int result = 0;
57+
int result = 0;
5858

5959
//eg. cigar = "25M5S15M", here r 2 operator "M" and 1 "S"
6060
for (CigarElement element : cigar.getCigarElements()) {
6161
if (operator == element.getOperator()) {
62-
result += element.getLength();
62+
result += element.getLength();
63+
// Early termination if result already exceeds the threshold
64+
if (shortCutEnabled && result > value) {
65+
return true; // op.eval(result, value) will always return true
66+
}
6367
}
6468
}
65-
6669
return op.eval(result, value );
67-
6870
}
6971

7072
/**

qbamfilter/src/org/qcmg/qbamfilter/filter/Comparator.java

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,22 +43,21 @@ public enum Comparator {
4343
* return true if the String v1 doesn't contain String v2 for Comparator NotContain
4444
*/
4545
public boolean eval(String v1, String v2) {
46-
switch(this){
47-
case GreatEqual: return Integer.parseInt(v1) >= Integer.parseInt(v2);
48-
case SmallEqual: return Integer.parseInt(v1) <= Integer.parseInt(v2);
49-
case Great: return Integer.parseInt(v1) > Integer.parseInt(v2);
50-
case Small: return Integer.parseInt(v1) < Integer.parseInt(v2);
51-
case Equal: return v1.equalsIgnoreCase(v2);
52-
case NotEqual: return ! v1.equalsIgnoreCase(v2);
53-
case StartWith: return v1.toLowerCase().startsWith(v2.toLowerCase());
54-
case NotStartWith: return ! v1.toLowerCase().startsWith(v2.toLowerCase());
55-
case EndWith: return v1.toLowerCase().endsWith(v2.toLowerCase());
56-
case NotEndWith: return ! v1.toLowerCase().endsWith(v2.toLowerCase());
57-
case Contain: return v1.toLowerCase().contains(v2.toLowerCase());
58-
case NotContain: return ! v1.toLowerCase().contains(v2.toLowerCase());
59-
}
46+
return switch (this) {
47+
case GreatEqual -> Integer.parseInt(v1) >= Integer.parseInt(v2);
48+
case SmallEqual -> Integer.parseInt(v1) <= Integer.parseInt(v2);
49+
case Great -> Integer.parseInt(v1) > Integer.parseInt(v2);
50+
case Small -> Integer.parseInt(v1) < Integer.parseInt(v2);
51+
case Equal -> v1.equalsIgnoreCase(v2);
52+
case NotEqual -> !v1.equalsIgnoreCase(v2);
53+
case StartWith -> v1.toLowerCase().startsWith(v2.toLowerCase());
54+
case NotStartWith -> !v1.toLowerCase().startsWith(v2.toLowerCase());
55+
case EndWith -> v1.toLowerCase().endsWith(v2.toLowerCase());
56+
case NotEndWith -> !v1.toLowerCase().endsWith(v2.toLowerCase());
57+
case Contain -> v1.toLowerCase().contains(v2.toLowerCase());
58+
case NotContain -> !v1.toLowerCase().contains(v2.toLowerCase());
59+
};
6060

61-
throw new AssertionError("Unknown comparator mark:" + this);
6261
}
6362

6463

@@ -151,7 +150,7 @@ public boolean eval(boolean v1, boolean v2){
151150
* @param comp: valid string parameter must belong to [">=", ">", "<=", "<", "==", "!="]
152151
* @return one of the six Comparators based on the parameter string comp
153152
*/
154-
public static Comparator GetComparator(String comp, String value) {
153+
public static Comparator getComparator(String comp, String value) {
155154

156155
return switch (comp) {
157156
case ">=" -> GreatEqual;

qbamfilter/src/org/qcmg/qbamfilter/filter/MDFilter.java

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,55 +7,70 @@
77
package org.qcmg.qbamfilter.filter;
88

99
import htsjdk.samtools.SAMRecord;
10+
import htsjdk.samtools.SAMTag;
1011
import htsjdk.samtools.filter.SamRecordFilter;
1112

13+
import java.util.BitSet;
14+
1215
public class MDFilter implements SamRecordFilter{
1316
private final boolean mismatchFilter;
1417
private final int value;
1518
private final Comparator op;
19+
public static final short MD_TAG = SAMTag.makeBinaryTag("MD");
1620

1721
/**
18-
* Initialise cigar operator name, comparator and operator value
19-
* @param operatorName : At moment the valid name is "mismatch".
20-
* @param comp: see details of valid comparator on org.qcmg.qbamfilter.filter.Comparator.
21-
* @param value: a integer string.
22-
* @throws Exception
23-
* See usage on method filterout.
22+
* Constructs an MDFilter object to filter SAM records based on the mismatch condition in the MD field.
23+
* The filter checks whether the specified mismatch condition satisfies the given comparison criteria.
24+
*
25+
* @param operatorName The name of the operator being used. Only "mismatch" is valid for this filter.
26+
* @param comp The comparator defining the condition to be applied. E.g., GreatEqual, Small, etc.
27+
* @param value A string representing the numeric threshold for the mismatch condition. Must be a valid integer.
28+
* @throws Exception If the value is not a valid integer or if an invalid operator is provided.
2429
*/
2530
public MDFilter (String operatorName, Comparator comp, String value ) throws Exception {
2631
try {
2732
this.value = Integer.parseInt(value);
2833
} catch(Exception e) {
29-
throw new Exception("non integer value used in DM field filter: MD_" +operatorName + comp.getString() + value);
34+
throw new Exception("non integer value used in MD field filter: MD_" +operatorName + comp.getString() + value);
3035
}
3136
op = comp;
3237
if (operatorName.equalsIgnoreCase("mismatch")){
3338
mismatchFilter = true;
3439
} else {
35-
throw new Exception("invalid MD String operator: " + operatorName + "in query condition Cigar_" + operatorName );
40+
throw new Exception("invalid MD String operator: " + operatorName + "in query condition MD_" + operatorName );
3641
}
3742
}
38-
3943

40-
private static int tallyMDMismatches(String mdData) {
44+
public static int tallyMDMismatches(String mdData) {
45+
if (mdData == null || mdData.isEmpty()) {
46+
return 0;
47+
}
48+
4149
int count = 0;
42-
if (null != mdData) {
43-
for (int i = 0, size = mdData.length() ; i < size ; ) {
44-
char c = mdData.charAt(i);
45-
if (isValidMismatch(c)) {
46-
count++;
50+
int size = mdData.length();
51+
int i = 0;
52+
53+
while (i < size) {
54+
char c = mdData.charAt(i);
55+
56+
if (Character.isDigit(c)) {
57+
i++;
58+
while (i < size && Character.isDigit(mdData.charAt(i))) {
59+
i++;
60+
}
61+
} else if (c == 'A' || c == 'C' || c == 'G' || c == 'T' || c == 'N') {
62+
count++;
63+
i++;
64+
} else if (c == '^') {
65+
// Skip the segment after '^' (indicating a deletion)
66+
i++;
67+
while (i < size && Character.isLetter(mdData.charAt(i))) {
4768
i++;
48-
} else if ('^' == c) {
49-
while (++i < size && Character.isLetter(mdData.charAt(i))) {}
50-
} else i++; // need to increment this or could end up with infinite loop...
69+
}
5170
}
5271
}
5372
return count;
5473
}
55-
56-
private static boolean isValidMismatch(char c) {
57-
return c == 'A' || c == 'C' || c == 'G' || c == 'T' || c == 'N';
58-
}
5974

6075
/**
6176
* check the record base length with required CigarOperator.
@@ -67,7 +82,7 @@ private static boolean isValidMismatch(char c) {
6782
*/
6883
@Override
6984
public boolean filterOut(final SAMRecord record){
70-
String attribute = (String)record.getAttribute("MD");
85+
String attribute = (String)record.getAttribute(MD_TAG);
7186

7287
if (attribute == null) {
7388
return false;
@@ -88,5 +103,4 @@ public boolean filterOut(SAMRecord arg0, SAMRecord arg1) {
88103
// TODO Auto-generated method stub
89104
return false;
90105
}
91-
92106
}

qbamfilter/src/org/qcmg/qbamfilter/filter/TagValueFilter.java

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,46 +6,44 @@
66
*/
77
package org.qcmg.qbamfilter.filter;
88

9+
import htsjdk.samtools.SAMTag;
910
import htsjdk.samtools.filter.SamRecordFilter;
1011
import htsjdk.samtools.SAMRecord;
1112
import htsjdk.samtools.SAMTagUtil;
1213

1314
public final class TagValueFilter implements SamRecordFilter{
1415

15-
private static SAMTagUtil stu = SAMTagUtil.getSingleton();
1616
private final short tagShort;
1717
private final String value;
1818
private final Comparator op;
1919

2020

2121
/**
2222
* initialise optional field name, comparator and field value
23-
* @param tag : the optional field name,it will be convert to upper case automatically.
23+
* @param tag : the optional field name,it will be converted to upper case automatically.
2424
* @param comp: see details of valid comparator on org.qcmg.qbamfilter.filter.Comparator.
2525
* @param value: a string value.
2626
* See usage on method filterOut.
2727
*/
2828
public TagValueFilter(String tag, Comparator comp, String value ){
29-
tagShort = stu.makeBinaryTag(tag);
29+
tagShort = SAMTag.makeBinaryTag(tag);
3030
this.value = value;
3131
op = comp;
3232
}
3333

3434
/**
35-
* check the optional filed in SAMRecord. return true if that field value is satified by the condition
35+
* check the optional field in SAMRecord. return true if that field value is satisfied by the condition
3636
* @param record: a SAMRecord
37-
* @return true if this potional field is satisfied with the query
37+
* @return true if this optional field is satisfied with the query
3838
* Usage example: if you want filter out all reads with field "ZM",and its value is one.
3939
* SamRecordFilter myfilter = new TagValueFilter("ZM",Comparator.Equal, "1" );
4040
* if(myfilter.filterout(record) == true){ System.out.println(record.toString);}
4141
*/
4242
@Override
4343
public boolean filterOut(final SAMRecord record){
44-
//if that tag fileld is not exists, it return null
4544
Object ob = record.getAttribute(tagShort);
46-
4745
if (ob != null) {
48-
return op.eval(ob.toString(),value );
46+
return op.eval(ob.toString(), value);
4947
}
5048
return false;
5149
}

qbamfilter/src/org/qcmg/qbamfilter/grammars/Condition.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,10 @@ public class Condition {
5656
else
5757
this.value = value;
5858

59-
op = Comparator.GetComparator(comp, value);
59+
op = Comparator.getComparator(comp, value);
6060

6161
if(op == null)
62-
throw new Exception(String.format("invalide condition in query: %s %s %s ", key, comp, value));
62+
throw new Exception(String.format("invalid condition in query: %s %s %s ", key, comp, value));
6363

6464
}
6565

qbamfilter/src/org/qcmg/qbamfilter/grammars/Expression.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,16 +43,16 @@ public void addOperator(queryTree.Operator op){
4343
public boolean filterOut(final SAMRecord record){
4444

4545
if(operator == queryTree.Operator.AND){
46-
for(int i = 0; i < conditions.size(); i ++){
47-
if( ! conditions.get(i).filterOut(record))
48-
return false;
49-
}
46+
for (SamRecordFilter condition : conditions) {
47+
if (!condition.filterOut(record))
48+
return false;
49+
}
5050
return true;
5151
}else{ // case of OR
52-
for(int i = 0; i < conditions.size(); i ++){
53-
if(conditions.get(i).filterOut(record))
54-
return true;
55-
}
52+
for (SamRecordFilter condition : conditions) {
53+
if (condition.filterOut(record))
54+
return true;
55+
}
5656
return false;
5757
}
5858
}

0 commit comments

Comments
 (0)