1 /**
 * FASTQ is a format for storing DNA sequences together with their associated
 * quality information, which is usually encoded as ASCII characters. An entry
 * is typically made of 4 lines. For example, 2 FASTQ entries look like this:
5  *
6  * @seq1
7  * TTATTTTAAT
8  * +
9  * ?+BBB/DHH@
10  * @seq2
11  * GACCCTTTGCA
12  * +
13  * ?+BHB/DIH@
14  *
15  * See_Also: https://en.wikipedia.org/wiki/FASTQ_format
16  * See_Also: http://forum.dlang.org/post/nd01qd$2k8c$1@digitalmars.com
17  */
18 module fastq;
19 
20 @safe pure nothrow @nogc:
21 
/**
 * A single FASTQ entry.
 *
 * Every field is a slice of the text that was handed to `parse`; no character
 * data is copied.
 */
struct FastQRecord
{
    // TODO `inout` support like in `rdf.d`
    const(char)[] sequenceId;      // header line without the leading '@'
    const(char)[] sequenceLetters; // second line of the entry
    const(char)[] quality;         // fourth line of the entry

    /**
     * Lazily splits `from` into FASTQ records.
     *
     * Each record is read as exactly four lines: `@id`, the sequence, a
     * separator line (its content is ignored) and the quality string. A
     * trailing `\r` is removed from every line, and characters in the range
     * `0 .. ' '` between records (blank lines, spaces, tabs) are skipped.
     * The last line of the input does not need a terminating newline.
     *
     * Malformed input (a record not starting with `@`, or a record that is
     * cut off before its quality line) is reported through `assert`.
     *
     * Params:
     *     from = FASTQ text; may be `null`, empty or whitespace-only, in which
     *            case the returned range is empty
     *
     * Returns: an input range (`front` / `empty` / `popFront`) of `FastQRecord`
     */
    static auto parse(const(char)[] from)
    {
        static struct Result
        {
        @safe pure nothrow:
            private
            {
                const(char)[] source; // text that has not been consumed yet
                FastQRecord value;    // record currently exposed by `front`
                bool isEmpty;         // true once no further record exists
            }

            this(const(char)[] source)
            {
                this.source = source;
                popFront; // prime `front` with the first record, if any
            }

            @property
            {
                /// The record most recently parsed by `popFront`.
                FastQRecord front()
                {
                    return value;
                }

                /// True when the input holds no further record.
                bool empty()
                {
                    return isEmpty;
                }
            }

            /// Parses the next record into `front`, or marks the range empty.
            void popFront()
            {
                skipBlank();

                // Covers null, empty and whitespace-only input as well as the
                // regular end of data after the last record.
                if (source.length == 0)
                {
                    source = null;
                    isEmpty = true;
                    return;
                }

                assert(source[0] == '@', "FASTQ record must start with '@'");

                // `skipBlank` left a character greater than ' ' at the front, so
                // the header line holds at least one character and `[1 .. $]`
                // cannot go out of bounds, even for an empty id.
                value.sequenceId = takeLine(true)[1 .. $];
                value.sequenceLetters = takeLine(true);
                takeLine(true); // separator line ("+" or "+id"), content unused
                value.quality = takeLine(false); // may end at end of input
            }

            /// Drops leading characters in the range `0 .. ' '` from `source`.
            private void skipBlank()
            {
                size_t i = 0;
                while (i < source.length && source[i] <= ' ')
                    ++i;
                source = source[i .. $];
            }

            /**
             * Detaches the next line from `source` and returns it without its
             * `\n` and without a trailing `\r`.
             *
             * Params:
             *     mustBeTerminated = when true, a missing `\n` means the
             *                        record is truncated and trips an assert
             */
            private const(char)[] takeLine(bool mustBeTerminated)
            {
                import std.string : indexOf;

                const ptrdiff_t found = source.indexOf("\n");
                assert(found >= 0 || !mustBeTerminated, "truncated FASTQ record");

                const(char)[] line;
                if (found < 0)
                {
                    // Last line of the input without a terminating newline.
                    line = source;
                    source = null;
                }
                else
                {
                    const size_t eol = cast(size_t) found;
                    line = source[0 .. eol];
                    source = source[eol + 1 .. $];
                }

                // Accept Windows line endings.
                if (line.length != 0 && line[$ - 1] == '\r')
                    line = line[0 .. $ - 1];

                return line;
            }
        }

        return Result(from);
    }
}
143 
// Three well-formed entries must be returned as three records, in input order.
unittest
{
    // The raw literal opens with a newline for readability; `[1 .. $]` drops it.
    string text = `
@seq1
TTATTTTAAT
+
?+BBB/DHH@
@seq2
GACCCTTTGCA
+
?+BHB/DIH@
@SEQ_ID
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
`[1 .. $];

    auto records = FastQRecord.parse(text);

    assert(equal(records,
                 [
                     FastQRecord("seq1", "TTATTTTAAT", "?+BBB/DHH@"),
                     FastQRecord("seq2", "GACCCTTTGCA", "?+BHB/DIH@"),
                     FastQRecord("SEQ_ID",
                                 "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT",
                                 "!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65"),
                 ].s[]));
}
165 
166 version(unittest)
167 {
168     import std.algorithm.comparison : equal;
169     import nxt.array_help : s;
170 }