1 /**
 * FASTQ is a format for storing DNA sequences together with their associated
 * quality information, usually encoded as ASCII characters. Each entry is
 * typically made of 4 lines; for example, 2 FASTQ entries would look like this:
5  *
6  * @seq1
7  * TTATTTTAAT
8  * +
9  * ?+BBB/DHH@
10  * @seq2
11  * GACCCTTTGCA
12  * +
13  * ?+BHB/DIH@
14  *
15  * See_Also: https://en.wikipedia.org/wiki/FASTQ_format
16  * See_Also: http://forum.dlang.org/post/nd01qd$2k8c$1@digitalmars.com
17  */
18 module fastq;
19 
20 pure nothrow @safe @nogc:
21 
/**
 * One FASTQ entry: identifier, sequence letters and quality string.
 *
 * All fields are slices into the text handed to `parse`; nothing is copied.
 */
struct FastQRecord
{
	/+ TODO: `inout` support like in `rdf.d` +/

	/// Text of the header line, without the leading `@` and line terminator.
	const(char)[] sequenceId;
	/// Second line of the entry, without its line terminator.
	const(char)[] sequenceLetters;
	/// Fourth line of the entry, without its line terminator.
	const(char)[] quality;

	/**
	 * Lazily parses FASTQ text.
	 *
	 * Params:
	 *   from = FASTQ text; lines may end in `\n` or `\r\n`, and the final
	 *          line may lack a terminator. Bytes `<= ' '` between entries
	 *          (and before the first one) are skipped.
	 *
	 * Returns: an input range (`front`, `empty`, `popFront`) of `FastQRecord`.
	 *          Empty or whitespace-only input gives an empty range.
	 *
	 * Malformed input (an entry not starting with `@`, or an entry cut short
	 * before its fourth line) trips an `assert`.
	 */
	static auto parse(const(char)[] from)
	{
		static struct Result
		{
		@safe pure nothrow:
			private
			{
				const(char)[] source; // text not yet consumed
				FastQRecord value;    // entry currently exposed by `front`
				bool isEmpty;
			}

			this(const(char)[] source)
			{
				this.source = source;
				popFront; // prime `value` with the first entry
			}

			@property
			{
				/// Current entry; only meaningful while `!empty`.
				FastQRecord front()
				{
					return value;
				}

				/// True once all entries have been consumed.
				bool empty()
				{
					return isEmpty;
				}
			}

			/// Parses the next entry into `front`, or marks the range empty.
			void popFront()
			{
				skipBlanks();

				if (source.length == 0)
				{
					// Covers null, "" and whitespace-only remainders alike, so
					// no phantom default-initialised entry is ever exposed.
					source = null;
					isEmpty = true;
					return;
				}

				// After `skipBlanks` the first byte is > ' ', hence the header
				// line holds at least one character and `[1 .. $]` is in range.
				assert(source[0] == '@', "FASTQ entry must start with '@'");
				value.sequenceId = takeLine()[1 .. $];

				assert(source.length != 0, "FASTQ entry ends after its header line");
				value.sequenceLetters = takeLine();

				assert(source.length != 0, "FASTQ entry ends after its sequence line");
				takeLine(); // the "+[sequenceId]" separator line carries no data we keep

				assert(source.length != 0, "FASTQ entry ends before its quality line");
				value.quality = takeLine();
			}

			/// Drops leading bytes in the range 0 .. ' ' (blanks, newlines, control chars).
			private void skipBlanks()
			{
				size_t i = 0;
				while (i < source.length && source[i] <= ' ')
					++i;
				source = source[i .. $];
			}

			/**
			 * Removes the next line from `source` and returns it without its
			 * `\n` or `\r\n` terminator. If no `\n` is left, the whole
			 * remainder is returned and `source` becomes null.
			 */
			private const(char)[] takeLine()
			{
				import std.string : indexOf;

				const ptrdiff_t eol = source.indexOf("\n");
				const(char)[] line;

				if (eol < 0)
				{
					// last line of the input has no trailing newline
					line = source;
					source = null;
				}
				else
				{
					line = source[0 .. eol];
					source = source[eol + 1 .. $];
				}

				// length check guards the `$ - 1` index on an empty line
				if (line.length != 0 && line[$ - 1] == '\r')
					line = line[0 .. $ - 1];

				return line;
			}
		}

		return Result(from);
	}
}
143 
/// Three well-formed entries separated by plain `\n` parse into three records.
unittest
{
	// The raw literal opens with a newline purely for layout; `[1 .. $]` drops it.
	string fastqText = `
@seq1
TTATTTTAAT
+
?+BBB/DHH@
@seq2
GACCCTTTGCA
+
?+BHB/DIH@
@SEQ_ID
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
`[1 .. $];

	auto records = FastQRecord.parse(fastqText);

	// Expected records are passed through `.s[]` exactly as before.
	assert(equal(records,
				 [
					 FastQRecord("seq1",
								 "TTATTTTAAT",
								 "?+BBB/DHH@"),
					 FastQRecord("seq2",
								 "GACCCTTTGCA",
								 "?+BHB/DIH@"),
					 FastQRecord("SEQ_ID",
								 "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT",
								 "!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65"),
				 ].s[]));
}
164 
165 version (unittest)
166 {
167 	import std.algorithm.comparison : equal;
168 	import nxt.array_help : s;
169 }