/**
   Generic Loader for delimited text files.

   $(LREF tabular) is the main function to be used.

   Copyright: Copyright 2013 the authors.

   License: BSD 3-Clause

   Authors: $(WEB https://github.com/agordon/ , A. Gordon), JM
*/
module nxt.tabular;

// import std.typetuple;
import std.traits: isNumeric, Select;
import std.typecons: Tuple, tuple, isTuple;
import std.functional: unaryFun;
import std.string: translate;
// import std.array;
import std.conv: text;
import std.exception: assertThrown;
import std.stdio: File;
import std.file: FileException;
import std.range;

private
{
    /**
       Removes one leading $(D delimiter) from $(D inputString).

       Throws: $(D Exception) if $(D inputString) is empty or does not start
       with $(D delimiter).
    */
    @safe pure void consumeDelimiter(S, D)(ref S inputString, const D delimiter)
    {
        if (inputString.empty || inputString[0] != delimiter)
            throw new Exception("missing delimiter");

        inputString = inputString[1..$];
    }

    unittest
    {
        string s = "\t2\t3";
        consumeDelimiter(s,'\t');
        assert(s=="2\t3");
        // Trying to remove a delimiter when none is available is a throwable offense
        assertThrown!Exception(consumeDelimiter(s,'\t'));
        // Trying to remove a delimiter from an empty string is a throwable offense
        s = "";
        assertThrown!Exception(consumeDelimiter(s,' '));
    }

    /**
       Splits off the text preceding the first $(D delimiter).

       Returns: the slice of $(D inputString) up to (not including) the first
       $(D delimiter), or the whole string when no delimiter is present.
       On return $(D inputString) starts at the delimiter (or is empty).
    */
    @safe S consumeStringField(S,D)(ref S inputString, const D delimiter)
    {
        // Index of the first delimiter; defaults to "end of input".
        size_t fieldEnd = inputString.length;
        foreach (i, dchar c; inputString)
        {
            if (c == delimiter)
            {
                fieldEnd = i;
                break;
            }
        }

        auto field = inputString[0 .. fieldEnd];
        inputString = inputString[fieldEnd .. $];
        return field;
    }

    unittest
    {
        // Consume the first field
        string s = "hello\tworld";
        string t = consumeStringField(s,'\t');
        assert(s=="\tworld");
        assert(t=="hello");

        // Consume the next (and last) field
        consumeDelimiter(s,'\t');
        t = consumeStringField(s,'\t');
        assert(s=="");
        assert(t=="world");

        // No string before delimiter - return an empty string
        s = "\tfoo\tbar";
        t = consumeStringField(s,'\t');
        assert(s=="\tfoo\tbar");
        assert(t=="");

        // Empty string - is a valid single (empty) field
        s = "";
        t = consumeStringField(s,'\t');
        assert(s=="");
        assert(t=="");

        // No delimiter in string - treat it as a valid single field
        s = "hello world";
        t = consumeStringField(s,'\t');
        assert(s=="");
        assert(t=="hello world");
    }

    /**
       Returns a copy of $(D s) in which LF, TAB, CR and NUL characters are
       replaced by the printable markers $(D <LF>), $(D <TAB>), $(D <CR>) and
       $(D <NULL>). Used to make error messages readable.
    */
    @safe pure S quotemeta(S)(const S s)
    {
        string[dchar] meta = [ '\n' : "<LF>",
                               '\t' : "<TAB>",
                               '\r' : "<CR>",
                               '\0' : "<NULL>" ];

        return translate(s,meta);
    }

    unittest
    {
        string s="1\t2\t3\n";
        auto t = quotemeta(s);
        assert(t=="1<TAB>2<TAB>3<LF>");

        // String with null
        s="1\0002";
        t = quotemeta(s);
        assert(t=="1<NULL>2");

        // Empty string
        s="";
        t = quotemeta(s);
        assert(t=="");

        // Normal string
        s="1\\t2";
        t = quotemeta(s);
        assert(t=="1\\t2");
    }

    /**
       Single-character variant of $(D quotemeta): returns the printable
       marker for LF, TAB, CR and NUL, or a one-character string otherwise.
    */
    @safe pure string quotemeta(const char c)
    {
        // A switch avoids building an associative array on every call.
        switch (c)
        {
            case '\n': return "<LF>";
            case '\t': return "<TAB>";
            case '\r': return "<CR>";
            case '\0': return "<NULL>";
            default:   return [c];
        }
    }

    unittest
    {
        assert(quotemeta('\t')=="<TAB>");
        assert(quotemeta('\r')=="<CR>");
        assert(quotemeta('\n')=="<LF>");
        assert(quotemeta('\00')=="<NULL>");
        assert(quotemeta('t')=="t");
    }

} // private


/**
   Parses string $(D input), delimited by character $(D delimiter), into a tuple of variables $(D arg).

   Returns:
   On success, the function returns nothing (void), and all the members of the tuple are populated.

   Throws:
   $(XREF std.exception.Exception) on failure to correctly parse the string.

   Example:
   ----
   string s = "Hello World 42";
   Tuple!(string,string,int) t;
   parseDelimited(s,' ',t);
   assert(t[0]=="Hello");
   assert(t[1]=="World");
   assert(t[2]==42);
   ----

   Notes:
   $(OL
   $(LI Parsing is much stricter (and less tolerant) than $(XREF std.format.formattedRead))
   $(LI White-space is never automatically skipped)
   $(LI A space delimiter consumes only the space character (ASCII 32, 0x20), not TAB (ASCII 9))
   $(LI Multiple consecutive delimiters are not consumed as one delimiter (e.g. "1\t\t\t2" is considered a string with four fields - it has three delimiters. It will throw an exception because empty fields are not allowed).)
   $(LI All fields must exist (i.e. if the tuple $(D arg) has 3 members, the $(D input) string must contain two delimiters and three valid values))
   $(LI For a string field, empty values are not acceptable, will throw an exception)
   $(LI Extra characters at the end of a field or the line will throw an exception)
   )

*/
@safe void parseDelimited(Data)(const string input,
                                const char delimiter,
                                ref Data arg)
{
    // Imported at function scope so that both the try body and the catch
    // clause see the same, explicitly imported, symbols.
    import std.conv : ConvException, parse;

    enum fieldCount = Data.Types.length;

    string remainingInput = input;

    foreach (i, T; Data.Types)
    {
        //TODO: Handle other types (for now, only numeric or strings)
        static if (isNumeric!T)
        {
            try
            {
                // consume a numeric field; parse advances remainingInput
                arg[i] = parse!T(remainingInput);
            }
            catch (ConvException e)
            {
                throw new Exception(text("failed to parse numeric value in field ", i+1,
                                         " (text is '",quotemeta(remainingInput),"')"));
            }
        }
        else
        {
            // consume a string field
            arg[i] = consumeStringField(remainingInput,delimiter);
            if (arg[i].empty)
                throw new Exception(text("empty text at field ", i+1,
                                         " (remaining text is '",quotemeta(remainingInput),"')"));
        }

        static if (i < fieldCount-1)
        {
            // Not the last field - require more input
            if (remainingInput.empty)
                throw new Exception(text("input terminated too soon (expecting ",
                                         fieldCount," fields, got ", i+1, ")"));

            // Following the converted value of this field,
            // require a delimiter (to prevent extra characters, even whitespace)
            if (remainingInput[0] != delimiter)
                throw new Exception(text("extra characters in field ",i+1,
                                         " (starting at '",quotemeta(remainingInput),"')"));
            consumeDelimiter(remainingInput,delimiter);
        }
        else
        {
            // Last field: check for extra input
            if (!remainingInput.empty)
                throw new Exception(text("extra characters in last field ",i+1,
                                         " (starting at '",quotemeta(remainingInput),"')"));
        }

    }
}

unittest {
    Tuple!(int,string,int) a;
    parseDelimited("1 2 3",' ',a);
    assert(a[0]==1 && a[1]=="2" && a[2]==3);

    parseDelimited("1\t2\t3",'\t',a);
    assert(a[0]==1 && a[1]=="2" && a[2]==3);

    // Extra delimiter at the end of the line is not OK
    assertThrown!Exception(parseDelimited("1 2 3 ",' ',a));

    // Invalid number on first field (parse!int should fail)
    assertThrown!Exception(parseDelimited(".1 2 3",' ',a));

    // Extra characters in field 1 (after successful parse!int)
    assertThrown!Exception(parseDelimited("1. 2 3",' ',a));

    // Line contains too many fields
    assertThrown!Exception(parseDelimited("1 2 3 4",' ',a));

    // Line is too short
    assertThrown!Exception(parseDelimited("1 2",' ',a));

    // non-space/tab delimiter is fine
    parseDelimited("1|2|3",'|',a);
    assert(a[0]==1 && a[1]=="2" && a[2]==3);
    parseDelimited("1| 2 |3",'|',a);
    assert(a[0]==1 && a[1]==" 2 " && a[2]==3);

    // Spaces are bad (and not ignored) if delimiter is not space (for numeric fields)
    assertThrown!Exception(parseDelimited("1 |2|3",'|',a));
    assertThrown!Exception(parseDelimited(" 1|2|3",'|',a));
    assertThrown!Exception(parseDelimited(" 1|2| 3",'|',a));
    assertThrown!Exception(parseDelimited("1|2|3 ",'|',a));

    // For string fields, empty values are not OK (different from formattedRead())
    assertThrown!Exception(parseDelimited("1||3",'|',a));

    // For string fields, last value can't be empty (different from formattedRead())
    Tuple!(int,string,string) b;
    assertThrown!Exception(parseDelimited("1|2|",'|',b));

    // One field is OK
    Tuple!(string) c;
    parseDelimited("foo",' ',c);
    assert(c[0]=="foo");

    // Fields that are OK for floating-point types should not work for integers (extra characters)
    Tuple!(real,int) d;
    parseDelimited("4.5 9",' ',d);
    assert(d[0]==4.5 && d[1]==9);
    Tuple!(int,real) e;
    assertThrown!Exception(parseDelimited("4.5 9",' ',e));

    // Scientific notation - OK for floating-point types
    Tuple!(double,double) f;
    parseDelimited("-0.004e3 +4.3e10",' ',f);
    assert(f[0]==-0.004e3 && f[1]==43e9);

    // Scientific notation - fails for integers
    Tuple!(int,int) g;
    assertThrown!Exception(parseDelimited("-0.004e3 +4.3e10",' ',g));
}


/**
   Loads a delimited text file, line-by-line, parses the line into fields, and calls a delegate/function for each line.

   Returns:
   On success, the function returns nothing (void); the callback function has been called for every line.

   Throws:
   $(XREF std.exception.Exception) on failure to correctly parse a line.
   $(XREF std.file.FileException) on I/O failures.
   Any exception raised while handling a line (by the parser or by the
   callback) is reported as a $(XREF std.file.FileException) carrying the
   file name, line number and the original message.

   Example:
   ----
   // Load a text file with three numeric columns,
   // Store the tuple in an array
   // (NOTE: this is a naive, inefficient way to populate an array, see NOTES)
   alias Tuple!(int,int,int) T;
   T[] t;
   tabular!( T,              // The number and types of the (expected) fields in the file
             delegate(x)
             { t ~= x; },    // for each line read, call this function. X will be of type T.
             '\t'            // The delimiter (default = TAB)
           )("file.txt");    // The file name to read.
   ----

   Example:
   ----
   // Load a text file with three numeric columns,
   // Use the second column as a KEY and the third column as the VALUE.
   alias Tuple!(int,int,int) T;
   int[int] data;
   tabular!( T,              // The number and types of the (expected) fields in the file
             delegate(x)
             {               // for each line read, call this function. X will be of type T.
                 data[x[1]] = x[2] ;
             },
             '\t'            // The delimiter (default = TAB)
           )("file.txt");    // The file name to read.
   ----

   Notes:
   $(OL
   $(LI See $(LREF parseDelimited) for details about parsing the delimited lines of the file)
   )

   TODO: Make this an InputRange

*/
void tabular(Members, alias storeFunction, char delimiter='\t')(const string filename)
{
    static assert (isTuple!Members,"tabular: 1st template parameter must be a Tuple with the expected columns in the file");

    auto f = File(filename);
    scope(exit) f.close();
    auto lines=0;

    alias unaryFun!storeFunction _Fun;
    Members data;

    import nxt.bylinefast: byLineFast;
    foreach (origline; f.byLineFast())
    {
        ++lines;
        // Copy the line: parsed string fields are slices of it and are
        // handed to the callback.
        string line = origline.idup;
        try
        {
            parseDelimited(line, delimiter, data);
            _Fun(data);
        }
        catch ( Exception e )
        {
            throw new FileException(filename,text("invalid input at line ", lines,
                                                  ": expected ", data.tupleof.length,
                                                  " fields ",typeof(data.tupleof).stringof,
                                                  " delimiter by '",quotemeta(delimiter),
                                                  "' got '", origline,
                                                  "' error details: ", e.msg ));
        }
    }
}

unittest {
    import std.file ;
    auto deleteme = testFilename();
    write(deleteme,"1 2 3\n4 5 6\n");
    scope(exit)
    { assert(exists(deleteme)); remove(deleteme); }

    // Load a text file, with three fields, delimited with spaces.
    alias Tuple!(int,int,int) T;
    T[] t;
    tabular!( T,              // The number and types of the (expected) fields in the file
              delegate(x)
              { t ~= x; },    // for each line read, call this function. X will be of type T.
              ' '             // The delimiter (default = TAB)
            )(deleteme);      // The file name to read.
    assert(t.length==2);
    assert(t[0] == tuple(1,2,3));
    assert(t[1] == tuple(4,5,6));

    // Any kind of invalid data should throw an exception
    // NOTE: the delegate function does nothing, because we don't care about the data
    //       in this test.
    // NOTE: see more test cases for failed parsing in the unittest of 'parseDelimited'.
    auto deleteme2 = testFilename() ~ ".2";
    write(deleteme2,"1 Foo 3\n4 5 6\n"); // conversion will fail in the first line
    scope(exit)
    { assert(exists(deleteme2)); remove(deleteme2); }
    assertThrown!Exception( tabular!( T, (x) => {}, ' ')(deleteme2)) ;
}

/**
   Loads a delimited text file, line-by-line, parses the line into fields, returns an array of fields.

   Returns:
   On success, returns an array with one element per line. With several
   column types each element is a $(D Tuple!(Types)); with exactly one
   column type the element is that type itself (e.g. $(D int[])).

   Throws:
   $(XREF std.exception.Exception) on failure to correctly parse a line.
   $(XREF std.file.FileException) on I/O failures.

   Example:
   ----
   // Load a text file, tab-delimited, with three numeric columns.

   auto data = tabularArray!('\t', int,int,int)("file.txt");

   // data[0] will be of type Tuple!(int,int,int)
   ----
*/
Select!(Types.length == 1, Types[0][], Tuple!(Types)[])
tabularArray(char delimiter, Types...)(string filename)
{
    alias RetT = typeof(return);

    // tabular requires a Tuple describing the columns, even when there is
    // only one; the single-column result is unwrapped below.
    alias Row = Tuple!(Types);

    Appender!RetT app;

    static if (Types.length == 1)
        tabular! ( Row, x => app.put(x[0]) , delimiter ) (filename);
    else
        tabular! ( Row, x => app.put(x) , delimiter ) (filename);

    return app.data;
}

unittest {
    import std.file ;
    auto deleteme = testFilename() ~ ".3";
    write(deleteme,"1 2 3\n4 5 6\n");
    scope(exit)
    { assert(exists(deleteme)); remove(deleteme); }

    // Load a text file, with three fields, delimited with spaces.
    auto t = tabularArray!( ' ',          // delimiter
                            int, int, int // expected fields in the text file
                          )(deleteme);
    assert(t.length==2);
    assert(t[0] == tuple(1,2,3));
    assert(t[1] == tuple(4,5,6));

    // A single column yields a plain array of that column's type.
    auto deleteme4 = testFilename() ~ ".4";
    write(deleteme4,"10\n20\n30\n");
    scope(exit)
    { assert(exists(deleteme4)); remove(deleteme4); }

    auto col = tabularArray!( ' ', int )(deleteme4);
    static assert(is(typeof(col) == int[]));
    assert(col == [10, 20, 30]);
}

version (unittest) string testFilename(string file = __FILE__, size_t line = __LINE__)
{
    import std.path;
    import std.process: thisProcessID;
    return text("deleteme-.", thisProcessID(), ".", baseName(file), ".", line);
}

/*
  On Thursday, 16 May 2013 at 10:35:12 UTC, Dicebot wrote:
  > Want to bring into discussion people that are not on Google+.
  > Samuel recently has posted there some simple experiments with
  > bioinformatics and bad performance of Phobos-based snippet has
  > surprised me.
  >
  > I did explore issue a bit and reported results in a blog post
  > (snippets are really small and simple) :
  > http://dicebot.blogspot.com/2013/05/short-performance-tuning-story.html
  >
  > One open question remains though - can D/Phobos do better here?
  > Can some changes be done to Phobos functions in question to
  > improve performance or creating bioinformatics-specialized
  > library is only practical solution?

  I bet the problem is in readln. Currently, File.byLine() and
  readln() are extremely slow, because they call fgetc() one char
  at a time.

  I made an "byLineFast" implementation some time ago that is 10x
  faster than std.stdio.byLine. It reads lines through rawRead, and
  using buffers instead of char by char.

  I don't have the time to make it phobos-ready (unicode, etc.).
  But I'll paste it here for any one to use (it works perfectly).

  --jm
*/