nxt.tabular source code

1 /**
2    Generic Loader for delimited text files.
3 
4    $(LREF tabular) is the main function to be used.
5 
6    Copyright: Copyright 2013 the authors.
7 
8    License: BSD 3-Clause
9 
10    Authors: $(WEB https://github.com/agordon/ , A. Gordon), JM
11 */
12 module nxt.tabular;
13 
14 // import std.typetuple;
15 import std.traits: isNumeric, Select;
16 import std.typecons: Tuple, tuple, isTuple;
17 import std.functional: unaryFun;
18 import std.string: translate;
19 // import std.array;
20 import std.conv: text;
21 import std.exception: assertThrown;
22 import std.stdio: File;
23 import std.file: FileException;
24 import std.range;
25 
26 private
27 {
28     @safe pure void consumeDelimiter(S, D)(ref S inputString, const D delimiter)
29     {
30         if (inputString.empty || inputString[0] != delimiter)
31 	    throw new Exception("missing delimiter");
32 
33         inputString = inputString[1..$];
34     }
35 
36     unittest
37     {
38 	string s = "\t2\t3";
39 	consumeDelimiter(s,'\t');
40 	assert(s=="2\t3");
41 	//Trying to remove a delimiter when non is available is a throwable offense
42 	assertThrown!Exception(consumeDelimiter(s,'\t'));
43 	//Trying to remove a delimiter from an empty string is a throwable offense
44 	s = "";
45 	assertThrown!Exception(consumeDelimiter(s,' '));
46     }
47 
48     @safe S consumeStringField(S,D)(ref S inputString, const D delimiter)
49     {
50 	size_t j = inputString.length;
51 	foreach (i, dchar c; inputString)
52 	{
53             if ( c == delimiter )
54             {
55                 j = i;
56                 break;
57             }
58 	}
59 	scope(exit) inputString = inputString[j .. $];
60 	return inputString[0 .. j];
61     }
62 
63     unittest
64     {
65 	// Consume the first field
66 	string s = "hello\tworld";
67 	string t = consumeStringField(s,'\t');
68 	assert(s=="\tworld");
69 	assert(t=="hello");
70 
71 	// Consume the next (and last) field
72 	consumeDelimiter(s,'\t');
73 	t = consumeStringField(s,'\t');
74 	assert(s=="");
75 	assert(t=="world");
76 
77 	// No string before delimiter - return an empty string
78 	s = "\tfoo\tbar";
79 	t = consumeStringField(s,'\t');
80 	assert(s=="\tfoo\tbar");
81 	assert(t=="");
82 
83 	// Empty string - is a valid single (empty) field
84 	s = "";
85 	t = consumeStringField(s,'\t');
86 	assert(s=="");
87 	assert(t=="");
88 
89 	// No delimiter in string - treat it as a valid single field
90 	s = "hello world";
91 	t = consumeStringField(s,'\t');
92 	assert(s=="");
93 	assert(t=="hello world");
94     }
95 
96     @safe pure S quotemeta(S)(const S s)
97     {
98 	string[dchar] meta = [ '\n' : "<LF>",
99                                '\t' : "<TAB>",
100                                '\r' : "<CR>",
101                                '\0' : "<NULL>" ];
102 
103 	return translate(s,meta);
104     }
105 
106     unittest
107     {
108 	string s="1\t2\t3\n";
109 	auto t = quotemeta(s);
110 	assert(t=="1<TAB>2<TAB>3<LF>");
111 
112 	//String with null
113 	s="1\0002";
114 	t = quotemeta(s);
115 	assert(t=="1<NULL>2");
116 
117 	//Empty string
118 	s="";
119 	t = quotemeta(s);
120 	assert(t=="");
121 
122 	// Normal string
123 	s="1\\t2";
124 	t = quotemeta(s);
125 	assert(t=="1\\t2");
126     }
127 
128     @safe pure string quotemeta(const char c)
129     {
130 	string[dchar] meta = [ '\n' : "<LF>",
131                                '\t' : "<TAB>",
132                                '\r' : "<CR>",
133                                '\0' : "<NULL>" ];
134 	if (c in meta)
135             return meta[c];
136 
137 	return [c];
138     }
139 
140     unittest
141     {
142 	assert(quotemeta('\t')=="<TAB>");
143 	assert(quotemeta('\r')=="<CR>");
144 	assert(quotemeta('\n')=="<LF>");
145 	assert(quotemeta('\00')=="<NULL>");
146 	assert(quotemeta('t')=="t");
147     }
148 
149 } // private
150 
151 
/**
   Parses string $(D input), delimited by character $(D delimiter), into the members of the tuple $(D arg).

   Params:
   input = the text of a single record; every character is significant
   delimiter = the single character that separates adjacent fields
   arg = the tuple whose members receive the parsed values, in field order

   Returns:
   On success, the function returns nothing (void), and all the members of the tuple are populated.
   On failure, members that precede the offending field may already have been overwritten.

   Throws:
   $(D Exception) on failure to correctly parse the string.

   Example:
   ----
   string s = "Hello World 42";
   Tuple!(string,string,int) t;
   parseDelimited(s,' ',t);
   assert(t[0]=="Hello");
   assert(t[1]=="World");
   assert(t[2]==42);
   ----

   Notes:
   $(OL
   $(LI Parsing is much stricter (and less tolerant) than $(D std.format.formattedRead))
   $(LI White-space is never automatically skipped)
   $(LI A space delimiter consumes only the space character (ASCII 0x20), not TAB (ASCII 0x09))
   $(LI Multiple consecutive delimiters are not consumed as one delimiter: "1\t\t\t2" is considered a string with four fields separated by three delimiters, and is rejected because empty fields are not allowed)
   $(LI All fields must exist (i.e. if the tuple $(D arg) has 3 members, the $(D input) string must contain two delimiters and three valid values))
   $(LI For a string field, empty values are not acceptable and throw an exception)
   $(LI Extra characters at the end of a field or the line throw an exception)
   $(LI Only numeric and string members are supported; any other member type is rejected at compile time)
   )

*/
@safe void parseDelimited(Data)(const string input,
                                const char delimiter,
                                ref Data arg)
{
    // Imported at function scope so the names are visible to both the
    // `try` body and the `catch` clause below.
    import std.conv : ConvException, parse;

    enum fieldCount = Data.Types.length;

    string remainingInput = input;

    foreach (i, T; Data.Types)
    {
        //TODO: Handle other types (for now, only numeric or strings)
        static if (isNumeric!T)
        {
            try
            {
                // consume a numeric field; `parse` advances `remainingInput`
                // past the characters it accepted
                arg[i] = parse!T(remainingInput);
            }
            catch (ConvException e)
            {
                throw new Exception(text("failed to parse numeric value in field ", i+1,
                                         " (text is '",quotemeta(remainingInput),"')"));
            }
        }
        else
        {
            // Fail early with a readable message instead of an obscure
            // assignment error deep inside the template.
            static assert(__traits(compiles, arg[i] = remainingInput),
                          "parseDelimited: unsupported field type '" ~ T.stringof
                          ~ "' (only numeric and string fields are handled)");

            // consume a string field
            arg[i] = consumeStringField(remainingInput,delimiter);
            if (arg[i].empty)
                throw new Exception(text("empty text at field ", i+1,
                                         " (remaining text is '",quotemeta(remainingInput),"')"));
        }

        static if (i < fieldCount-1)
        {
            //Not the last field - require more input
            if (remainingInput.empty)
                throw new Exception(text("input terminated too soon (expecting ",
                                         fieldCount," fields, got ", i+1, ")"));

            //Following the converted value of this field,
            //require a delimiter (to prevent extra characters, even whitespace)
            if (remainingInput[0] != delimiter)
                throw new Exception(text("extra characters in field ",i+1,
                                         " (starting at '",quotemeta(remainingInput),"')"));
            consumeDelimiter(remainingInput,delimiter);
        }
        else
        {
            // Last field: check for extra input
            if (!remainingInput.empty)
                throw new Exception(text("extra characters in last field ",i+1,
                                         " (starting at '",quotemeta(remainingInput),"')"));
        }

    }
}
239 
unittest
{
    Tuple!(int,string,int) row;

    // Plain success cases with space and tab delimiters
    parseDelimited("1 2 3",' ',row);
    assert(row[0]==1);
    assert(row[1]=="2");
    assert(row[2]==3);

    parseDelimited("1\t2\t3",'\t',row);
    assert(row[0]==1);
    assert(row[1]=="2");
    assert(row[2]==3);

    // A trailing delimiter after the last field is rejected
    assertThrown!Exception(parseDelimited("1 2 3 ",' ',row));

    // First field is not a valid integer (parse!int fails outright)
    assertThrown!Exception(parseDelimited(".1 2 3",' ',row));

    // First field parses as an integer but is followed by junk
    assertThrown!Exception(parseDelimited("1. 2 3",' ',row));

    // More fields than the tuple has members
    assertThrown!Exception(parseDelimited("1 2 3 4",' ',row));

    // Fewer fields than the tuple has members
    assertThrown!Exception(parseDelimited("1 2",' ',row));

    // A delimiter other than space/tab works just the same
    parseDelimited("1|2|3",'|',row);
    assert(row[0]==1);
    assert(row[1]=="2");
    assert(row[2]==3);

    // ... and spaces inside a string field are preserved verbatim
    parseDelimited("1|  2  |3",'|',row);
    assert(row[0]==1);
    assert(row[1]=="  2  ");
    assert(row[2]==3);

    // Around numeric fields, stray spaces are errors when space is not the delimiter
    assertThrown!Exception(parseDelimited("1 |2|3",'|',row));
    assertThrown!Exception(parseDelimited(" 1|2|3",'|',row));
    assertThrown!Exception(parseDelimited(" 1|2| 3",'|',row));
    assertThrown!Exception(parseDelimited("1|2|3 ",'|',row));

    // An empty string field in the middle is rejected (unlike formattedRead())
    assertThrown!Exception(parseDelimited("1||3",'|',row));

    // An empty string field at the end is rejected too (unlike formattedRead())
    Tuple!(int,string,string) trailingString;
    assertThrown!Exception(parseDelimited("1|2|",'|',trailingString));

    // A single-member tuple is fine
    Tuple!(string) single;
    parseDelimited("foo",' ',single);
    assert(single[0]=="foo");

    // Text that is valid for a floating-point member leaves extra characters for an integer member
    Tuple!(real,int) realThenInt;
    parseDelimited("4.5 9",' ',realThenInt);
    assert(realThenInt[0]==4.5);
    assert(realThenInt[1]==9);

    Tuple!(int,real) intThenReal;
    assertThrown!Exception(parseDelimited("4.5 9",' ',intThenReal));

    // Scientific notation is accepted for floating-point members ...
    Tuple!(double,double) doubles;
    parseDelimited("-0.004e3 +4.3e10",' ',doubles);
    assert(doubles[0]==-0.004e3);
    assert(doubles[1]==43e9);

    // ... but not for integer members
    Tuple!(int,int) ints;
    assertThrown!Exception(parseDelimited("-0.004e3 +4.3e10",' ',ints));
}
304 
305 
/**
   Reads a delimited text file line by line, parses each line into a tuple of fields,
   and hands that tuple to a callback.

   Params:
   Members = a $(D Tuple) type listing the number and types of the expected columns
   storeFunction = the function/delegate invoked once per line with the parsed $(D Members) value
   delimiter = the column separator (default: TAB)
   filename = the path of the file to read

   Returns:
   Nothing (void). On success the callback has been called once for every line of the file.

   Throws:
   $(D std.file.FileException) when a line cannot be processed. Any $(D Exception) raised while
   parsing the line or while running $(D storeFunction) is converted into a $(D FileException)
   whose message carries the line number, the expected fields, the delimiter, the offending
   line and the original error text.
   Failures while opening or reading the file happen outside that conversion and propagate unchanged.

   Example:
   ----
   // Load a text file with three numeric columns,
   // Store the tuple in an array
   // (NOTE: this is a naive, inefficient way to populate an array, see NOTES)
   alias Tuple!(int,int,int) T;
   T[] t;
   tabular!( T,           // The number and types of the (expected) fields in the file
             delegate(x)
             { t ~= x; }, // for each line read, call this function. X will be of type T.
             '\t'         // The delimiter (default = TAB)
           )("file.txt"); // The file name to read.
   ----

   Example:
   ----
   // Load a text file with three numeric columns,
   // Use the second column as a KEY and the third column as the VALUE.
   alias Tuple!(int,int,int) T;
   int[int] data;
   tabular!( T,              // The number and types of the (expected) fields in the file
             delegate(x)
             {   // for each line read, call this function. X will be of type T.
                 data[x[1]] = x[2] ;
             },
             '\t'             // The delimiter (default = TAB)
           )("file.txt");    // The file name to read.
   ----

   Notes:
   $(OL
   $(LI See $(LREF parseDelimited) for details about parsing the delimited lines of the file)
   )

   TODO: Make this an InputRange

*/
void tabular(Members, alias storeFunction, char delimiter='\t')(const string filename)
{
    static assert (isTuple!Members,"tabular: 1st template parameter must be a Tuple with the expected columns in the file");

    import nxt.bylinefast: byLineFast;

    alias unaryFun!storeFunction handleRow;

    auto input = File(filename);
    scope(exit) input.close();

    Members row;
    auto lineNumber = 0;

    foreach (rawLine; input.byLineFast())
    {
        ++lineNumber;

        // Work on an immutable copy of the line; string members of `row`
        // are slices of it. (Presumably the reader reuses its buffer
        // between iterations - confirm against nxt.bylinefast.)
        string line = rawLine.idup;

        try
        {
            parseDelimited(line, delimiter, row);
            handleRow(row);
        }
        catch ( Exception cause )
        {
            // Re-throw with enough context to locate the bad line in the file.
            throw new FileException(filename,text("invalid input at line ", lineNumber,
                                                  ": expected ", row.tupleof.length,
                                                  " fields ",typeof(row.tupleof).stringof,
                                                  " delimiter by '",quotemeta(delimiter),
                                                  "' got '", rawLine,
                                                  "' error details: ", cause.msg ));
        }
    }
}
386 
unittest
{
    import std.file : exists, remove, write;

    alias Tuple!(int,int,int) Row;

    // A well-formed, space-delimited file with three integer columns.
    auto goodFile = testFilename();
    write(goodFile,"1 2 3\n4 5 6\n");
    scope(exit)
    {
        assert(exists(goodFile));
        remove(goodFile);
    }

    Row[] rows;
    tabular!( Row,           // the number and types of the expected columns
              delegate(x)
              { rows ~= x; }, // called once per line; x is a Row
              ' '            // the delimiter (default = TAB)
        )(goodFile);         // the file to read
    assert(rows.length==2);
    assert(rows[0] == tuple(1,2,3));
    assert(rows[1] == tuple(4,5,6));

    // Invalid data of any kind must surface as an exception.
    // The callback is a no-op here because only the failure matters;
    // the unittest of 'parseDelimited' covers the individual failure modes.
    auto badFile = testFilename() ~ ".2";
    write(badFile,"1 Foo 3\n4 5 6\n"); // the first line cannot be converted
    scope(exit)
    {
        assert(exists(badFile));
        remove(badFile);
    }
    assertThrown!Exception( tabular!( Row, (x) => {}, ' ')(badFile)) ;
}
417 
/**
Loads a delimited text file, line-by-line, parses each line into fields, and returns all lines as an array.

Params:
delimiter = the column separator
Types = the types of the expected columns, in order
filename = the path of the file to read

Returns:
On success, an array with one element per line of the file. With two or more column
types each element is a $(D Tuple!(Types)); with exactly one column type each element
is the bare value of that type (i.e. the result is $(D Types[0][])).

Throws:
Whatever $(LREF tabular) throws: a $(D std.file.FileException) describing the offending
line when a line cannot be parsed, and any error raised while opening or reading the file.

Example:
----
// Load a text file, tab-delimited, with three numeric columns.

auto data = tabularArray!('\t', int,int,int)("file.txt");

// data[0] will be of type Tuple!(int,int,int)
----
*/
Select!(Types.length == 1, Types[0][], Tuple!(Types)[])
tabularArray(char delimiter, Types...)(string filename)
{
    // `tabular` only accepts Tuple types, so lines are always parsed into a
    // tuple, even when a single column is requested.
    alias Row = Tuple!(Types);

    Appender!(typeof(return)) app;

    static if (Types.length == 1)
    {
        // Single column: unwrap the one-member tuple to match the return type.
        tabular! ( Row, x => app.put(x[0]) , delimiter ) (filename);
    }
    else
    {
        tabular! ( Row, x => app.put(x) , delimiter ) (filename);
    }

    return app.data;
}
450 
unittest
{
    import std.file : exists, remove, write;

    // A space-delimited file with three integer columns on each of two lines.
    auto path = testFilename() ~ ".3";
    write(path,"1 2 3\n4 5 6\n");
    scope(exit)
    {
        assert(exists(path));
        remove(path);
    }

    auto rows = tabularArray!( ' ',          // delimiter
                               int, int, int // expected column types
        )(path);

    assert(rows.length==2);
    assert(rows[0] == tuple(1,2,3));
    assert(rows[1] == tuple(4,5,6));
}
467 
/**
Builds a scratch file name for use in unit tests.

The name is "deleteme-." followed by the current process id, the base name of
$(D file) and $(D line), separated by dots.

Params:
file = source file name to embed (defaults to the caller's file)
line = line number to embed (defaults to the caller's line)

Returns:
The generated file name.
*/
version(unittest) string testFilename(string file = __FILE__, size_t line = __LINE__)
{
    import std.path : baseName;
    import std.process : thisProcessID;

    immutable pid = thisProcessID();
    immutable sourceName = baseName(file);
    return text("deleteme-.", pid, ".", sourceName, ".", line);
}
474 
475 /*
476 On Thursday, 16 May 2013 at 10:35:12 UTC, Dicebot wrote:
477 > Want to bring into discussion people that are not on Google+.
478 > Samuel recently has posted there some simple experiments with
479 > bioinformatics and bad performance of Phobos-based snippet has
480 > surprised me.
481 >
482 > I did explore issue a bit and reported results in a blog post
483 > (snippets are really small and simple) :
484 > http://dicebot.blogspot.com/2013/05/short-performance-tuning-story.html
485 >
486 > One open question remains though - can D/Phobos do better here?
487 > Can some changes be done to Phobos functions in question to
488 > improve performance or creating bioinformatics-specialized
489 > library is only practical solution?
490 
491 I bet the problem is in readln. Currently, File.byLine() and
492 readln() are extremely slow, because they call fgetc() one char
493 at a time.
494 
495 I made an "byLineFast" implementation some time ago that is 10x
496 faster than std.stdio.byLine. It reads lines through rawRead, and
497 using buffers instead of char by char.
498 
499 I don't have the time to make it phobos-ready (unicode, etc.).
500 But I'll paste it here for any one to use (it works perfectly).
501 
502 --jm
503 */