/**
 * FASTQ is a format for storing DNA sequences together with the associated
 * quality information, often encoded in ASCII characters. A record is typically
 * made of 4 lines; for example, 2 FASTQ entries would look like this:
 *
 * @seq1
 * TTATTTTAAT
 * +
 * ?+BBB/DHH@
 * @seq2
 * GACCCTTTGCA
 * +
 * ?+BHB/DIH@
 *
 * See_Also: https://en.wikipedia.org/wiki/FASTQ_format
 * See_Also: http://forum.dlang.org/post/nd01qd$2k8c$1@digitalmars.com
 */
module fastq;

@safe pure nothrow @nogc:

/**
 * A single FASTQ record.
 *
 * All three fields are slices of the text handed to `parse`; nothing is copied.
 */
struct FastQRecord
{
    // TODO `inout` support like in `rdf.d`
    const(char)[] sequenceId;       /// Identifier line without the leading '@'.
    const(char)[] sequenceLetters;  /// Raw sequence line.
    const(char)[] quality;          /// Quality line.

    /**
     * Lazily parses `from` into a range of `FastQRecord`.
     *
     * Params:
     *     from = FASTQ text. Lines may end in "\n" or "\r\n". Bytes in the
     *            range 0 .. ' ' before a record are skipped. The last quality
     *            line may be terminated by end of input instead of a newline.
     *
     * Returns: An input range (`front`, `empty`, `popFront`) of records. The
     *          range is empty when `from` is `null`, empty, or contains only
     *          bytes in the range 0 .. ' '.
     *
     * Malformed input (a record not starting with '@', a missing or empty
     * line) is reported through `assert`.
     */
    static auto parse(const(char)[] from)
    {
        static struct Result
        {
        @safe pure nothrow:
            private
            {
                const(char)[] source;   // unparsed remainder; null once exhausted
                FastQRecord value;      // record currently exposed by `front`
                bool isEmpty;
            }

            this(const(char)[] source)
            {
                this.source = source;
                popFront; // prime `value` with the first record
            }

            @property
            {
                FastQRecord front()
                {
                    return value;
                }

                bool empty()
                {
                    return isEmpty;
                }
            }

            /// Drops one trailing '\r' so CRLF input parses like LF input.
            private static const(char)[] stripCR(const(char)[] line)
            {
                // Length guard: an empty line must not be indexed with `$-1`.
                if (line.length != 0 && line[$ - 1] == '\r')
                    return line[0 .. $ - 1];
                return line;
            }

            /// Advances `source` past leading bytes in 0 .. ' '; sets it to
            /// `null` when nothing else remains.
            private void skipBlanks()
            {
                foreach (i, c; source)
                {
                    if (c > ' ')
                    {
                        source = source[i .. $];
                        return;
                    }
                }

                source = null;
            }

            void popFront()
            {
                import std.string : indexOf;

                if (source is null)
                {
                    isEmpty = true;
                    return;
                }

                skipBlanks();

                if (source is null)
                {
                    // Input held nothing but blanks: there is no record to
                    // expose, so the range must report empty rather than
                    // yield a default-initialised record.
                    isEmpty = true;
                    return;
                }

                // sequenceId

                assert(source[0] == '@');

                ptrdiff_t len = source.indexOf("\n");
                assert(len > 0);

                value.sequenceId = stripCR(source[1 .. len]);
                source = source[len + 1 .. $];

                // sequenceLetters

                len = source.indexOf("\n");
                assert(len > 0);

                value.sequenceLetters = stripCR(source[0 .. len]);
                source = source[len + 1 .. $];

                // +sequenceId (separator line, content ignored)

                len = source.indexOf("\n");
                assert(len > 0);
                source = source[len + 1 .. $];

                // quality

                len = source.indexOf("\n");
                if (len < 0)
                    len = source.length; // last line terminated by end of input
                assert(len > 0);

                value.quality = stripCR(source[0 .. len]);

                if (source.length > len + 1)
                {
                    source = source[len + 1 .. $];
                    // Pre-skip blanks so the next popFront sees either the
                    // next '@' or a null (exhausted) source.
                    skipBlanks();
                }
                else
                    source = null;
            }
        }

        return Result(from);
    }
}

unittest
{
    string input = `
@seq1
TTATTTTAAT
+
?+BBB/DHH@
@seq2
GACCCTTTGCA
+
?+BHB/DIH@
@SEQ_ID
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
`[1 .. $];
    assert(equal(FastQRecord.parse(input),
                 [FastQRecord("seq1", "TTATTTTAAT", "?+BBB/DHH@"),
                  FastQRecord("seq2", "GACCCTTTGCA", "?+BHB/DIH@"),
                  FastQRecord("SEQ_ID", "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT", "!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65")].s[]));

    // Blank or empty input yields no records.
    assert(FastQRecord.parse(null).empty);
    assert(FastQRecord.parse("").empty);
    assert(FastQRecord.parse(" \r\n\t\n").empty);

    // CRLF line endings and a missing final newline are accepted.
    auto crlf = FastQRecord.parse("@a\r\nAC\r\n+\r\n!!");
    assert(!crlf.empty);
    assert(crlf.front == FastQRecord("a", "AC", "!!"));
    crlf.popFront();
    assert(crlf.empty);
}

version(unittest)
{
    import std.algorithm.comparison : equal;
    import nxt.array_help : s;
}