/**
   Generic Loader for delimited text files.

   $(LREF tabular) is the main function to be used.

   Copyright: Copyright 2013 the authors.

   License: BSD 3-Clause

   Authors: $(WEB https://github.com/agordon/ , A. Gordon), JM
*/
module nxt.tabular;

// import std.typetuple;
import std.traits: isNumeric, Select;
import std.typecons: Tuple, tuple, isTuple;
import std.functional: unaryFun;
import std.string: translate;
// import std.array;
import std.conv: text;
import std.exception: assertThrown;
import std.stdio: File;
import std.file: FileException;
import std.range;

private
{
    /**
       Removes one leading $(D delimiter) from $(D inputString).

       Throws: $(D Exception) if $(D inputString) is empty or does not start
       with $(D delimiter).
    */
    @safe pure void consumeDelimiter(S, D)(ref S inputString, const D delimiter)
    {
        if (inputString.empty || inputString[0] != delimiter)
            throw new Exception("missing delimiter");

        inputString = inputString[1..$];
    }

    unittest
    {
        string s = "\t2\t3";
        consumeDelimiter(s,'\t');
        assert(s=="2\t3");
        //Trying to remove a delimiter when none is available is a throwable offense
        assertThrown!Exception(consumeDelimiter(s,'\t'));
        //Trying to remove a delimiter from an empty string is a throwable offense
        s = "";
        assertThrown!Exception(consumeDelimiter(s,' '));
    }

    /**
       Splits off the text preceding the first $(D delimiter).

       Returns: the slice of $(D inputString) up to (not including) the first
       $(D delimiter), or the whole string when no delimiter is present.
       On return $(D inputString) starts at the delimiter (which is NOT
       consumed) or is empty.
    */
    @safe S consumeStringField(S,D)(ref S inputString, const D delimiter)
    {
        // Default: no delimiter found, the field is the whole remaining input
        size_t j = inputString.length;
        foreach (i, dchar c; inputString)
        {
            if ( c == delimiter )
            {
                j = i;
                break;
            }
        }
        // Advance the caller's string only after the result slice was taken
        scope(exit) inputString = inputString[j .. $];
        return inputString[0 .. j];
    }

    unittest
    {
        // Consume the first field
        string s = "hello\tworld";
        string t = consumeStringField(s,'\t');
        assert(s=="\tworld");
        assert(t=="hello");

        // Consume the next (and last) field
        consumeDelimiter(s,'\t');
        t = consumeStringField(s,'\t');
        assert(s=="");
        assert(t=="world");

        // No string before delimiter - return an empty string
        s = "\tfoo\tbar";
        t = consumeStringField(s,'\t');
        assert(s=="\tfoo\tbar");
        assert(t=="");

        // Empty string - is a valid single (empty) field
        s = "";
        t = consumeStringField(s,'\t');
        assert(s=="");
        assert(t=="");

        // No delimiter in string - treat it as a valid single field
        s = "hello world";
        t = consumeStringField(s,'\t');
        assert(s=="");
        assert(t=="hello world");
    }

    /**
       Returns a copy of $(D s) in which LF, TAB, CR and NUL characters are
       replaced by the visible markers "<LF>", "<TAB>", "<CR>" and "<NULL>"
       (used to make error messages readable).
    */
    @safe pure S quotemeta(S)(const S s)
    {
        string[dchar] meta = [ '\n' : "<LF>",
                               '\t' : "<TAB>",
                               '\r' : "<CR>",
                               '\0' : "<NULL>" ];

        return translate(s,meta);
    }

    unittest
    {
        string s="1\t2\t3\n";
        auto t = quotemeta(s);
        assert(t=="1<TAB>2<TAB>3<LF>");

        //String with null
        s="1\0002";
        t = quotemeta(s);
        assert(t=="1<NULL>2");

        //Empty string
        s="";
        t = quotemeta(s);
        assert(t=="");

        // Normal string
        s="1\\t2";
        t = quotemeta(s);
        assert(t=="1\\t2");
    }

    /**
       Single-character variant: returns the visible marker for LF, TAB, CR
       and NUL, or a one-character string holding $(D c) otherwise.
    */
    @safe pure string quotemeta(const char c)
    {
        string[dchar] meta = [ '\n' : "<LF>",
                               '\t' : "<TAB>",
                               '\r' : "<CR>",
                               '\0' : "<NULL>" ];
        // Single lookup: `in` yields a pointer to the value (or null)
        if (auto replacement = c in meta)
            return *replacement;

        return [c];
    }

    unittest
    {
        assert(quotemeta('\t')=="<TAB>");
        assert(quotemeta('\r')=="<CR>");
        assert(quotemeta('\n')=="<LF>");
        assert(quotemeta('\00')=="<NULL>");
        assert(quotemeta('t')=="t");
    }

} // private


/**
   Parses string $(D input), delimited by character $(D delimiter), into a tuple of variables $(D arg).

   Returns:
   On success, the function returns nothing (void), and all the members of the tuple are populated.

   Throws:
   $(XREF std.exception.Exception) on failure to correctly parse the string.

   Example:
   ----
   string s = "Hello World 42";
   Tuple!(string,string,int) t;
   parseDelimited(s,' ',t);
   assert(t[0]=="Hello");
   assert(t[1]=="World");
   assert(t[2]==42);
   ----

   Notes:
   $(OL
   $(LI Parsing is much stricter (and less tolerant) than $(XREF std.format.formattedRead))
   $(LI White-space is never automatically skipped)
   $(LI A space delimiter consumes only the space character (ASCII 32, 0x20), not TAB (ASCII 9))
   $(LI Multiple consecutive delimiters are not consumed as one delimiter (e.g. "1\t\t\t2" is considered a string with four fields - it has three delimiters. It will throw an exception because empty fields are not allowed).)
   $(LI All fields must exist (i.e. if the tuple $(D arg) has 3 members, the $(D input) string must contain two delimiters and three valid values))
   $(LI For a string field, empty values are not acceptable, will throw an exception)
   $(LI Extra characters at the end of a field or the line will throw an exception)
   )

*/
@safe void parseDelimited(Data)(const string input,
                                const char delimiter,
                                ref Data arg)
{
    // Imported at function scope so that both the `try` body and the
    // `catch` clause below can name symbols through `std.conv.`.
    static import std.conv;

    string remainingInput = input;

    foreach (i, T; Data.Types)
    {
        //TODO: Handle other types (for now, only numeric or strings)
        static if (isNumeric!T)
        {
            try
            {
                // consume a numeric field
                arg[i] = std.conv.parse!T(remainingInput);
            }
            catch ( std.conv.ConvException e )
            {
                throw new Exception(text("failed to parse numeric value in field ", i+1,
                                         " (text is '",quotemeta(remainingInput),"')"));
            }
        }
        else
        {
            // consume a string field
            arg[i] = consumeStringField(remainingInput,delimiter);
            if (arg[i].empty)
                throw new Exception(text("empty text at field ", i+1,
                                         " (remaining text is '",quotemeta(remainingInput),"')"));
        }

        static if (i<Data.length-1)
        {
            //Not the last field - require more input
            if (remainingInput.empty)
                throw new Exception(text("input terminated too soon (expecting ",
                                         Data.length," fields, got ", i+1, ")"));

            //Following the converted value of this field,
            //require a delimiter (to prevent extra characters, even whitespace)
            if (remainingInput[0] != delimiter)
                throw new Exception(text("extra characters in field ",i+1,
                                         " (starting at '",quotemeta(remainingInput),"')"));
            consumeDelimiter(remainingInput,delimiter);
        }
        else
        {
            // Last field: check for extra input
            if (!remainingInput.empty)
                throw new Exception(text("extra characters in last field ",i+1,
                                         " (starting at '",quotemeta(remainingInput),"')"));
        }

    }
}

unittest
{
    Tuple!(int,string,int) a;
    parseDelimited("1 2 3",' ',a);
    assert(a[0]==1 && a[1]=="2" && a[2]==3);

    parseDelimited("1\t2\t3",'\t',a);
    assert(a[0]==1 && a[1]=="2" && a[2]==3);

    //Extra delimiter at the end of the line is not OK
    assertThrown!Exception(parseDelimited("1 2 3 ",' ',a));

    //Invalid number on first field (parse!int should fail)
    assertThrown!Exception(parseDelimited(".1 2 3",' ',a));

    //Extra characters in field 1 (After successful parse!int)
    assertThrown!Exception(parseDelimited("1. 2 3",' ',a));

    //Line contains too many fields
    assertThrown!Exception(parseDelimited("1 2 3 4",' ',a));

    //Line is too short
    assertThrown!Exception(parseDelimited("1 2",' ',a));

    //non-space/tab delimiter is fine
    parseDelimited("1|2|3",'|',a);
    assert(a[0]==1 && a[1]=="2" && a[2]==3);
    parseDelimited("1| 2 |3",'|',a);
    assert(a[0]==1 && a[1]==" 2 " && a[2]==3);

    //Spaces are bad (and not ignored) if delimiter is not space (for numeric fields)
    assertThrown!Exception(parseDelimited("1 |2|3",'|',a));
    assertThrown!Exception(parseDelimited(" 1|2|3",'|',a));
    assertThrown!Exception(parseDelimited(" 1|2| 3",'|',a));
    assertThrown!Exception(parseDelimited("1|2|3 ",'|',a));

    //For string fields, empty values are not OK (different from formattedRead())
    assertThrown!Exception(parseDelimited("1||3",'|',a));

    //For string fields, last value can't be empty (different from formattedRead())
    Tuple!(int,string,string) b;
    assertThrown!Exception(parseDelimited("1|2|",'|',b));

    //One field is OK
    Tuple!(string) c;
    parseDelimited("foo",' ',c);
    assert(c[0]=="foo");

    //Fields that are OK for floating-point types should not work for integers (extra characters)
    Tuple!(real,int) d;
    parseDelimited("4.5 9",' ',d);
    assert(d[0]==4.5 && d[1]==9);
    Tuple!(int,real) e;
    assertThrown!Exception(parseDelimited("4.5 9",' ',e));

    //scientific notation - OK for floating-point types
    Tuple!(double,double) f;
    parseDelimited("-0.004e3 +4.3e10",' ',f);
    assert(f[0]==-0.004e3 && f[1]==43e9);

    //Scientific notation - fails for integers
    Tuple!(int,int) g;
    assertThrown!Exception(parseDelimited("-0.004e3 +4.3e10",' ',g));
}


/**
   Loads a delimited text file, line-by-line, parses the line into fields, and calls a delegate/function for each line.

   Returns:
   On success, the function returns nothing (void), the call back function has been called for every line.

   Throws:
   $(XREF std.file.FileException) when a line cannot be parsed; the message
   carries the line number, the expected fields and the underlying error.
   Any $(D Exception) thrown by the call back function is reported the same way.
   I/O failures are reported by the underlying file routines.

   Example:
   ----
   // Load a text file with three numeric columns,
   // Store the tuple in an array
   // (NOTE: this is a naive, inefficient way to populate an array, see NOTES)
   alias Tuple!(int,int,int) T;
   T[] t;
   tabular!( T,         // The number and types of the (expected) fields in the file
             delegate(x)
             { t ~= x; }, // for each line read, call this function. X will be of type T.
             '\t'       // The delimiter (default = TAB)
           )("file.txt"); // The file name to read.
   ----

   Example:
   ----
   // Load a text file with three numeric columns,
   // Use the second column as a KEY and the third column as the VALUE.
   alias Tuple!(int,int,int) T;
   int[int] data;
   tabular!( T,         // The number and types of the (expected) fields in the file
             delegate(x)
             {          // for each line read, call this function. X will be of type T.
                 data[x[1]] = x[2] ;
             },
             '\t'       // The delimiter (default = TAB)
           )("file.txt"); // The file name to read.
   ----

   Notes:
   $(OL
   $(LI See $(LREF parseDelimited) for details about parsing the delimited lines of the file)
   )

   TODO: Make this an InputRange

*/
void tabular(Members, alias storeFunction, char delimiter='\t')(const string filename)
{
    static assert (isTuple!Members,"tabular: 1st template parameter must be a Tuple with the expected columns in the file");

    auto f = File(filename);
    scope(exit) f.close();
    auto lines=0;

    alias unaryFun!storeFunction _Fun;
    Members data;

    import nxt.bylinefast: byLineFast;
    foreach (origline; f.byLineFast())
    {
        ++lines;
        // Take an immutable copy of the line before handing it to the parser
        string line = origline.idup;
        try
        {
            parseDelimited(line, delimiter, data);
            _Fun(data);
        }
        catch ( Exception e )
        {
            // Re-throw with file name, line number and expected layout attached
            throw new FileException(filename,text("invalid input at line ", lines,
                                                  ": expected ", data.tupleof.length,
                                                  " fields ",typeof(data.tupleof).stringof,
                                                  " delimiter by '",quotemeta(delimiter),
                                                  "' got '", origline,
                                                  "' error details: ", e.msg ));
        }
    }
}

unittest
{
    import std.file ;
    auto deleteme = testFilename();
    write(deleteme,"1 2 3\n4 5 6\n");
    scope(exit)
    { assert(exists(deleteme)); remove(deleteme); }

    //Load a text file, with three fields, delimiter with spaces.
    alias Tuple!(int,int,int) T;
    T[] t;
    tabular!( T,         // The number and types of the (expected) fields in the file
              delegate(x)
              { t ~= x; }, // for each line read, call this function. X will be of type T.
              ' '        // The delimiter (default = TAB)
            )(deleteme); // The file name to read.
    assert(t.length==2);
    assert(t[0] == tuple(1,2,3));
    assert(t[1] == tuple(4,5,6));

    //Any kind of invalid data should throw an exception
    //NOTE: the delegate function does nothing, because we don't care about the data
    //      in this test.
    //NOTE: see more test cases for failed parsing in the unittest of 'parseDelimited'.
    auto deleteme2 = testFilename() ~ ".2";
    write(deleteme2,"1 Foo 3\n4 5 6\n"); // conversion will fail in the first line
    scope(exit)
    { assert(exists(deleteme2)); remove(deleteme2); }
    assertThrown!Exception( tabular!( T, (x) => {}, ' ')(deleteme2)) ;
}

/**
   Loads a delimited text file, line-by-line, parses the line into fields, returns an array of fields.

   Returns:
   On success, returns an array of tuples, based on template parameters.
   When exactly one type is given, a plain array of that type is returned
   instead of an array of one-element tuples.

   Throws:
   $(XREF std.exception.Exception) on failure to correctly parse a line
   (see $(LREF tabular)).

   Example:
   ----
   // Load a text file, tab-delimited, with three numeric columns.

   auto data = tabularArray!('\t', int,int,int)("file.txt");

   // data[0] will be of type Tuple!(int,int,int)
   ----
*/
Select!(Types.length == 1, Types[0][], Tuple!(Types)[])
tabularArray(char delimiter, Types...)(string filename)
{
    alias RetT = typeof(return);
    // `tabular` requires a Tuple describing the columns, even for one column
    alias Row = Tuple!Types;

    Appender!RetT app;

    static if (Types.length == 1)
    {
        // Single column: unwrap the one-element tuple into the bare value
        tabular! ( Row, x => app.put(x[0]) , delimiter ) (filename);
    }
    else
    {
        tabular! ( Row, x => app.put(x) , delimiter ) (filename);
    }

    return app.data;
}

unittest
{
    import std.file ;
    auto deleteme = testFilename() ~ ".3";
    write(deleteme,"1 2 3\n4 5 6\n");
    scope(exit)
    { assert(exists(deleteme)); remove(deleteme); }

    //Load a text file, with three fields, delimiter with spaces.
    auto t = tabularArray!( ' ',         // delimiter
                            int, int, int // expected fields in the text file
                          )(deleteme);
    assert(t.length==2);
    assert(t[0] == tuple(1,2,3));
    assert(t[1] == tuple(4,5,6));
}

unittest
{
    import std.file ;
    auto deleteme = testFilename() ~ ".4";
    write(deleteme,"7\n8\n");
    scope(exit)
    { assert(exists(deleteme)); remove(deleteme); }

    //A single column yields a plain array of the column type (not tuples)
    auto t = tabularArray!( ' ', int )(deleteme);
    static assert(is(typeof(t) == int[]));
    assert(t == [7, 8]);
}

version(unittest) string testFilename(string file = __FILE__, size_t line = __LINE__)
{
    import std.path;
    import std.process: thisProcessID;
    return text("deleteme-.", thisProcessID(), ".", baseName(file), ".", line);
}

/*
  On Thursday, 16 May 2013 at 10:35:12 UTC, Dicebot wrote:
  > Want to bring into discussion people that are not on Google+.
  > Samuel recently has posted there some simple experiments with
  > bioinformatics and bad performance of Phobos-based snippet has
  > surprised me.
  >
  > I did explore issue a bit and reported results in a blog post
  > (snippets are really small and simple) :
  > http://dicebot.blogspot.com/2013/05/short-performance-tuning-story.html
  >
  > One open question remains though - can D/Phobos do better here?
  > Can some changes be done to Phobos functions in question to
  > improve performance or creating bioinformatics-specialized
  > library is only practical solution?

  I bet the problem is in readln. Currently, File.byLine() and
  readln() are extremely slow, because they call fgetc() one char
  at a time.

  I made an "byLineFast" implementation some time ago that is 10x
  faster than std.stdio.byLine. It reads lines through rawRead, and
  using buffers instead of char by char.

  I don't have the time to make it phobos-ready (unicode, etc.).
  But I'll paste it here for any one to use (it works perfectly).

  --jm
*/