1 /**
2    Generic Loader for delimited text files.
3 
4    $(LREF tabular) is the main function to be used.
5 
6    Copyright: Copyright 2013 the authors.
7 
8    License: BSD 3-Clause
9 
10    Authors: $(WEB https://github.com/agordon/ , A. Gordon), JM
11 */
12 module nxt.tabular;
13 
14 // import std.typetuple;
15 import std.traits: isNumeric, Select;
16 import std.typecons: Tuple, tuple, isTuple;
17 import std.functional: unaryFun;
18 import std.string: translate;
19 // import std.array;
20 import std.conv: text;
21 import std.exception: assertThrown;
22 import std.stdio: File;
23 import std.file: FileException;
24 import std.range;
25 
26 private
27 {
28 	@safe pure void consumeDelimiter(S, D)(ref S inputString, const D delimiter)
29 	{
30 		if (inputString.empty || inputString[0] != delimiter)
31 		throw new Exception("missing delimiter");
32 
33 		inputString = inputString[1..$];
34 	}
35 
36 	unittest
37 	{
38 	string s = "\t2\t3";
39 	consumeDelimiter(s,'\t');
40 	assert(s=="2\t3");
41 	//Trying to remove a delimiter when non is available is a throwable offense
42 	assertThrown!Exception(consumeDelimiter(s,'\t'));
43 	//Trying to remove a delimiter from an empty string is a throwable offense
44 	s = "";
45 	assertThrown!Exception(consumeDelimiter(s,' '));
46 	}
47 
48 	@safe S consumeStringField(S,D)(ref S inputString, const D delimiter)
49 	{
50 	size_t j = inputString.length;
51 	foreach (i, dchar c; inputString)
52 	{
53 			if ( c == delimiter )
54 			{
55 				j = i;
56 				break;
57 			}
58 	}
59 	scope(exit) inputString = inputString[j .. $];
60 	return inputString[0 .. j];
61 	}
62 
63 	unittest
64 	{
65 	// Consume the first field
66 	string s = "hello\tworld";
67 	string t = consumeStringField(s,'\t');
68 	assert(s=="\tworld");
69 	assert(t=="hello");
70 
71 	// Consume the next (and last) field
72 	consumeDelimiter(s,'\t');
73 	t = consumeStringField(s,'\t');
74 	assert(s=="");
75 	assert(t=="world");
76 
77 	// No string before delimiter - return an empty string
78 	s = "\tfoo\tbar";
79 	t = consumeStringField(s,'\t');
80 	assert(s=="\tfoo\tbar");
81 	assert(t=="");
82 
83 	// Empty string - is a valid single (empty) field
84 	s = "";
85 	t = consumeStringField(s,'\t');
86 	assert(s=="");
87 	assert(t=="");
88 
89 	// No delimiter in string - treat it as a valid single field
90 	s = "hello world";
91 	t = consumeStringField(s,'\t');
92 	assert(s=="");
93 	assert(t=="hello world");
94 	}
95 
96 	@safe pure S quotemeta(S)(const S s)
97 	{
98 	string[dchar] meta = [ '\n' : "<LF>",
99 							   '\t' : "<TAB>",
100 							   '\r' : "<CR>",
101 							   '\0' : "<NULL>" ];
102 
103 	return translate(s,meta);
104 	}
105 
106 	unittest
107 	{
108 	string s="1\t2\t3\n";
109 	auto t = quotemeta(s);
110 	assert(t=="1<TAB>2<TAB>3<LF>");
111 
112 	//String with null
113 	s="1\0002";
114 	t = quotemeta(s);
115 	assert(t=="1<NULL>2");
116 
117 	//Empty string
118 	s="";
119 	t = quotemeta(s);
120 	assert(t=="");
121 
122 	// Normal string
123 	s="1\\t2";
124 	t = quotemeta(s);
125 	assert(t=="1\\t2");
126 	}
127 
128 	@safe pure string quotemeta(const char c)
129 	{
130 	string[dchar] meta = [ '\n' : "<LF>",
131 							   '\t' : "<TAB>",
132 							   '\r' : "<CR>",
133 							   '\0' : "<NULL>" ];
134 	if (c in meta)
135 			return meta[c];
136 
137 	return [c];
138 	}
139 
140 	unittest
141 	{
142 	assert(quotemeta('\t')=="<TAB>");
143 	assert(quotemeta('\r')=="<CR>");
144 	assert(quotemeta('\n')=="<LF>");
145 	assert(quotemeta('\00')=="<NULL>");
146 	assert(quotemeta('t')=="t");
147 	}
148 
149 } // private
150 
151 
152 /**
   Parses string $(D input), delimited by character $(D delimiter), into a tuple of variables $(D arg).
154 
155    Returns:
156    On success, the function returns nothing (void), and all the members of the tuple are populated.
157 
158    Throws:
159    $(XREF std.exception.Exception) on failure to correctly parse the string.
160 
161    Example:
162    ----
163    string s = "Hello World 42";
164    Tuple!(string,string,int) t;
165    parseDelimited(s,' ',t);
166    assert(t[0]=="Hello");
167    assert(t[1]=="World");
168    assert(t[2]==42);
169    ----
170 
171    Notes:
172    $(OL
173    $(LI Parsing is much stricter (and less tolerant) than $(XREF std.format.formattedRead))
174    $(LI White-space is never automatically skipped)
   $(LI A space delimiter consumes only the space character (ASCII 0x20), not TAB (ASCII 9))
   $(LI Multiple consecutive delimiters are not consumed as one delimiter (e.g. "1\t\t\t2" is considered a string with four fields - it has three delimiters. It will throw an exception because empty fields are not allowed).)
177    $(LI All fields must exist (i.e. if the tuple $(D arg) has 3 members, the $(D input) string must contain two delimiters and three valid values))
178    $(LI For a string field, empty values are not acceptable, will throw an exception)
179    $(LI Extra characters at the end of a field or the line will throw an exception)
180    )
181 
182 */
@safe void parseDelimited(Data)(const string input,
								const char delimiter,
								ref Data arg)
{
	// Imported at function scope (not inside the `try` block) so that the
	// `catch` clause below can also name std.conv.ConvException.
	static import std.conv;

	// Unparsed tail of the line; each consumed field or delimiter is removed from its front.
	string remainingInput = input;

	foreach (i, T; Data.Types)
	{
		//TODO: Handle other types (for now, only numeric or strings)
		static if (isNumeric!T)
		{
			try
			{
				// consume a numeric field (parse advances remainingInput past what it used)
				arg[i] = std.conv.parse!T(remainingInput);
			}
			catch (std.conv.ConvException e)
			{
				// Re-throw as a plain Exception carrying the field number; the
				// original conversion error stays reachable through the exception chain.
				throw new Exception(text("failed to parse numeric value in field ", i+1,
										 " (text is '",quotemeta(remainingInput),"')"), e);
			}
		}
		else
		{
			// consume a string field (everything up to, but excluding, the next delimiter)
			arg[i] = consumeStringField(remainingInput,delimiter);
			if (arg[i].empty)
				throw new Exception(text("empty text at field ", i+1,
										 " (remaining text is '",quotemeta(remainingInput),"')"));
		}

		static if (i<Data.length-1)
		{
			//Not the last field - require more input
			if (remainingInput.empty)
				throw new Exception(text("input terminated too soon (expecting ",
										 Data.length," fields, got ", i+1, ")"));

			//Following the converted value of this field,
			//require a delimiter (to prevent extra characters, even whitespace)
			if (remainingInput[0] != delimiter)
				throw new Exception(text("extra characters in field ",i+1,
										 " (starting at '",quotemeta(remainingInput),"')"));
			consumeDelimiter(remainingInput,delimiter);
		}
		else
		{
			// Last field: anything left over (even a trailing delimiter) is an error
			if (!remainingInput.empty)
				throw new Exception(text("extra characters in last field ",i+1,
										 " (starting at '",quotemeta(remainingInput),"')"));
		}

	}
}
239 
unittest
{
	Tuple!(int,string,int) row;

	// Space-delimited and TAB-delimited input
	parseDelimited("1 2 3", ' ', row);
	assert(row[0] == 1 && row[1] == "2" && row[2] == 3);

	parseDelimited("1\t2\t3", '\t', row);
	assert(row[0] == 1 && row[1] == "2" && row[2] == 3);

	// An extra delimiter at the end of the line is not OK
	assertThrown!Exception(parseDelimited("1 2 3 ", ' ', row));

	// Invalid number in the first field (parse!int should fail)
	assertThrown!Exception(parseDelimited(".1 2 3", ' ', row));

	// Extra characters in field 1 (after a successful parse!int)
	assertThrown!Exception(parseDelimited("1. 2 3", ' ', row));

	// Line contains too many fields
	assertThrown!Exception(parseDelimited("1 2 3 4", ' ', row));

	// Line is too short
	assertThrown!Exception(parseDelimited("1 2", ' ', row));

	// A delimiter other than space/TAB is fine
	parseDelimited("1|2|3", '|', row);
	assert(row[0] == 1 && row[1] == "2" && row[2] == 3);
	parseDelimited("1|  2  |3", '|', row);
	assert(row[0] == 1 && row[1] == "  2  " && row[2] == 3);

	// Spaces are bad (and not ignored) in numeric fields when the delimiter is not a space
	assertThrown!Exception(parseDelimited("1 |2|3", '|', row));
	assertThrown!Exception(parseDelimited(" 1|2|3", '|', row));
	assertThrown!Exception(parseDelimited(" 1|2| 3", '|', row));
	assertThrown!Exception(parseDelimited("1|2|3 ", '|', row));

	// For string fields, empty values are not OK (different from formattedRead())
	assertThrown!Exception(parseDelimited("1||3", '|', row));

	// For string fields, the last value can't be empty either (different from formattedRead())
	Tuple!(int,string,string) trailingString;
	assertThrown!Exception(parseDelimited("1|2|", '|', trailingString));

	// A single field is OK
	Tuple!(string) single;
	parseDelimited("foo", ' ', single);
	assert(single[0] == "foo");

	// Text that is valid for floating-point types must not be accepted for integers (extra characters)
	Tuple!(real,int) realThenInt;
	parseDelimited("4.5 9", ' ', realThenInt);
	assert(realThenInt[0] == 4.5 && realThenInt[1] == 9);
	Tuple!(int,real) intThenReal;
	assertThrown!Exception(parseDelimited("4.5 9", ' ', intThenReal));

	// Scientific notation - OK for floating-point types
	Tuple!(double,double) doubles;
	parseDelimited("-0.004e3 +4.3e10", ' ', doubles);
	assert(doubles[0] == -0.004e3 && doubles[1] == 43e9);

	// Scientific notation - fails for integers
	Tuple!(int,int) ints;
	assertThrown!Exception(parseDelimited("-0.004e3 +4.3e10", ' ', ints));
}
303 
304 
305 /**
306    Loads a delimited text file, line-by-line, parses the line into fields, and calls a delegate/function for each line.
307 
308    Returns:
   On success, the function returns nothing (void); the callback function has been called for every line.
310 
311    Throws:
312    $(XREF std.exception.Exception) on failure to correctly parse a line.
313    $(XREF std.file.FileException) on I/O failures.
314 
315    Example:
316    ----
317 // Load a text file with three numeric columns,
318 // Store the tuple in an array
319 // (NOTE: this is a naive, inefficient way to populate an array, see NOTES)
320 alias Tuple!(int,int,int) T;
321 T[] t;
322 tabular!( T,		   // The number and types of the (expected) fields in the file
323 delegate(x)
324 { t ~= x; }, // for each line read, call this function. X will be of type T.
325 '\t'		 // The delimiter (default = TAB)
326 )("file.txt"); // The file name to read.
327 ----
328 
329 Example:
330 ----
331 // Load a text file with three numeric columns,
332 // Use the second column as a KEY and the third column as the VALUE.
333 alias Tuple!(int,int,int) T;
334 int[int] data;
335 tabular!( T,			  // The number and types of the (expected) fields in the file
336 delegate(x)
337 {   // for each line read, call this function. X will be of type T.
338 data[x[1]] = x[2] ;
339 },
340 '\t'			 // The delimiter (default = TAB)
341 )("file.txt");	// The file name to read.
342 ----
343 
344 Notes:
$(OL
$(LI See $(LREF parseDelimited) for details about parsing the delimited lines of the file)
)
349 
350 TODO: Make this an InputRange
351 
352 */
void tabular(Members, alias storeFunction, char delimiter='\t')(const string filename)
{
	static assert (isTuple!Members,"tabular: 1st template parameter must be a Tuple with the expected columns in the file");

	import nxt.bylinefast: byLineFast;

	auto inputFile = File(filename);
	scope(exit) inputFile.close();

	// Callback invoked once per successfully parsed line.
	alias store = unaryFun!storeFunction;

	// One tuple instance is reused for every line; parseDelimited fills it in place.
	Members record;

	// 1-based number of the line currently being processed (used in error messages).
	int lineNumber = 0;

	foreach (rawLine; inputFile.byLineFast())
	{
		lineNumber += 1;

		// parseDelimited takes a `string`, so hand it an immutable copy of the line.
		string line = rawLine.idup;
		try
		{
			parseDelimited(line, delimiter, record);
			store(record);
		}
		catch (Exception cause)
		{
			// Any failure in the block above is reported as a FileException that
			// names the file, the line number and the offending text.
			auto description = text("invalid input at line ", lineNumber,
									": expected ", record.tupleof.length,
									" fields ", typeof(record.tupleof).stringof,
									" delimiter by '", quotemeta(delimiter),
									"' got '", rawLine,
									"' error details: ", cause.msg);
			throw new FileException(filename, description);
		}
	}
}
385 
unittest
{
	import std.file;

	// Fixture: two well-formed lines with three space-delimited fields each.
	auto goodFile = testFilename();
	write(goodFile, "1 2 3\n4 5 6\n");
	scope(exit)
	{
		assert(exists(goodFile));
		remove(goodFile);
	}

	// Load the file and collect every parsed line into an array.
	alias Row = Tuple!(int,int,int);
	Row[] rows;
	tabular!( Row,			// number and types of the expected fields
			  delegate(x)
			  { rows ~= x; },	// called for each line read; x is of type Row
			  ' '			// the delimiter (default = TAB)
		)(goodFile);		// the file name to read
	assert(rows.length == 2);
	assert(rows[0] == tuple(1,2,3));
	assert(rows[1] == tuple(4,5,6));

	// Any kind of invalid data should throw an exception.
	// NOTE: the callback does nothing, because the data is irrelevant in this test.
	// NOTE: more failed-parsing cases are covered by the unittest of 'parseDelimited'.
	auto badFile = testFilename() ~ ".2";
	write(badFile, "1 Foo 3\n4 5 6\n"); // conversion will fail in the first line
	scope(exit)
	{
		assert(exists(badFile));
		remove(badFile);
	}
	assertThrown!Exception( tabular!( Row, (x) => {}, ' ')(badFile) );
}
415 
/**
Loads a delimited text file, line-by-line, parses each line into fields, and returns all parsed lines as an array.

Params:
delimiter = the character separating the fields of a line
Types = the expected type of each column, in order
filename = the name of the file to read

Returns:
On success, returns an array with one element per line:
$(D Tuple!(Types)[]) when several column types are given, or a plain
$(D Types[0][]) when exactly one column type is given.

Throws:
Whatever $(LREF tabular) throws: a line that cannot be parsed is reported as
$(XREF std.file.FileException) (which is an $(D Exception)); errors raised while
opening or reading the file propagate unchanged.

Example:
----
// Load a text file, tab-delimited, with three numeric columns.

auto data = tabularArray!('\t', int,int,int)("file.txt");

// data[0] will be of type Tuple!(int,int,int)

// Load a text file with a single numeric column.

auto column = tabularArray!('\t', int)("file.txt");

// column[0] will be of type int
----
*/
Select!(Types.length == 1, Types[0][], Tuple!(Types)[])
tabularArray(char delimiter, Types...)(string filename)
{
	alias RetT = typeof(return);

	Appender!RetT app;

	static if (Types.length == 1 && !isTuple!(Types[0]))
	{
		// Single plain column: tabular only accepts a Tuple, so parse into a
		// one-element tuple and store its only member to get a Types[0][] result.
		tabular! ( Tuple!(Types), x => app.put(x[0]) , delimiter ) (filename);
	}
	else
	{
		// Several columns (or a single type that already is a Tuple):
		// the array element type is itself the Tuple describing the columns.
		alias Members = ElementType!RetT;
		tabular! ( Members, x => app.put(x) , delimiter ) (filename);
	}

	return app.data;
}
448 
unittest
{
	import std.file;

	// Fixture: two lines with three space-delimited integers each.
	auto dataFile = testFilename() ~ ".3";
	write(dataFile, "1 2 3\n4 5 6\n");
	scope(exit)
	{
		assert(exists(dataFile));
		remove(dataFile);
	}

	// Load the whole file into an array of tuples.
	auto rows = tabularArray!( ' ',			// delimiter
							   int, int, int	// expected fields in the text file
		)(dataFile);
	assert(rows.length == 2);
	assert(rows[0] == tuple(1,2,3));
	assert(rows[1] == tuple(4,5,6));
}
464 
/// Builds a scratch file name for unittests from the process id, the base name
/// of the calling source file and the calling line number.
version (unittest) string testFilename(string file = __FILE__, size_t line = __LINE__)
{
	import std.path: baseName;
	import std.process: thisProcessID;

	const pid = thisProcessID();
	const sourceName = baseName(file);
	return text("deleteme-.", pid, ".", sourceName, ".", line);
}
471 
472 /*
473 On Thursday, 16 May 2013 at 10:35:12 UTC, Dicebot wrote:
474 > Want to bring into discussion people that are not on Google+.
475 > Samuel recently has posted there some simple experiments with
476 > bioinformatics and bad performance of Phobos-based snippet has
477 > surprised me.
478 >
479 > I did explore issue a bit and reported results in a blog post
480 > (snippets are really small and simple) :
481 > http://dicebot.blogspot.com/2013/05/short-performance-tuning-story.html
482 >
483 > One open question remains though - can D/Phobos do better here?
484 > Can some changes be done to Phobos functions in question to
485 > improve performance or creating bioinformatics-specialized
486 > library is only practical solution?
487 
488 I bet the problem is in readln. Currently, File.byLine() and
489 readln() are extremely slow, because they call fgetc() one char
490 at a time.
491 
492 I made an "byLineFast" implementation some time ago that is 10x
493 faster than std.stdio.byLine. It reads lines through rawRead, and
494 using buffers instead of char by char.
495 
496 I don't have the time to make it phobos-ready (unicode, etc.).
497 But I'll paste it here for any one to use (it works perfectly).
498 
499 --jm
500 */