1 module nxt.bylinefast; 2 3 import std.stdio : File; 4 import std.typecons : Flag; 5 6 alias KeepTerminator = Flag!"keepTerminator"; 7 8 /** 9 Reads by line in an efficient way (10 times faster than File.byLine from 10 std.stdio). This is accomplished by reading entire buffers (fgetc() is not 11 used), and allocating as little as possible. 12 13 The char \n is considered as default separator, removing the previous \r if 14 it exists. 15 16 The \n is never returned. The \r is not returned if it was 17 part of a \r\n (but it is returned if it was by itself). 18 19 The returned string is always a substring of a temporary buffer, that must 20 not be stored. If necessary, you must use str[] or .dup or .idup to copy to 21 another string. DIP-25 return qualifier is used in front() to add extra 22 checks in @safe callers of front(). 23 24 Example: 25 26 File f = File("file.txt"); 27 foreach (string line; ByLineFast(f)) { 28 ...process line... 29 //Make a copy: 30 string copy = line[]; 31 } 32 33 The file isn't closed when done iterating, unless it was the only reference to 34 the file (same as std.stdio.byLine). (example: ByLineFast(File("file.txt"))). 35 */ 36 struct ByLineFast(Char, Terminator) 37 { 38 import core.stdc.string : memmove; 39 import std.stdio : fgetc, ungetc; 40 41 File file; 42 char[] line; 43 bool first_call = true; 44 char[] buffer; 45 char[] strBuffer; 46 const string separator; 47 KeepTerminator keepTerminator; 48 49 this(File f, 50 KeepTerminator kt = KeepTerminator.no, 51 string separator = "\n", 52 uint bufferSize = 4096) @safe 53 { 54 assert(bufferSize > 0); 55 file = f; 56 this.separator = separator; 57 this.keepTerminator = kt; 58 buffer.length = bufferSize; 59 } 60 61 bool empty() const @property @trusted scope 62 { 63 // Its important to check "line !is null" instead of 64 // "line.length != 0", otherwise, no empty lines can 65 // be returned, the iteration would be closed. 66 if (line !is null) 67 { 68 return false; 69 } 70 if (!file.isOpen) 71 { 72 // Clean the buffer to avoid pointer false positives: 73 (cast(char[])buffer)[] = 0; 74 return true; 75 } 76 77 // First read. Determine if it's empty and put the char back. 78 auto mutableFP = (cast(File*) &file).getFP(); 79 const c = fgetc(mutableFP); 80 if (c == -1) 81 { 82 // Clean the buffer to avoid pointer false positives: 83 (cast(char[])buffer)[] = 0; 84 return true; 85 } 86 if (ungetc(c, mutableFP) != c) 87 { 88 assert(0, "Bug in cstdlib implementation"); 89 } 90 return false; 91 } 92 93 @property char[] front() @safe return scope 94 { 95 if (first_call) 96 { 97 popFront(); 98 first_call = false; 99 } 100 return line; 101 } 102 103 void popFront() @trusted scope 104 { 105 if (strBuffer.length == 0) 106 { 107 strBuffer = file.rawRead(buffer); 108 if (strBuffer.length == 0) 109 { 110 file.detach(); 111 line = null; 112 return; 113 } 114 } 115 116 // import std.string : indexOf; /+ TODO: algorithm indexOf +/ 117 import nxt.algorithm.searching : indexOf; 118 const pos = strBuffer.indexOf(this.separator); 119 if (pos != -1) 120 { 121 if (pos != 0 && strBuffer[pos-1] == '\r') 122 { 123 line = strBuffer[0 .. (pos-1)]; 124 } 125 else 126 { 127 line = strBuffer[0 .. pos]; 128 } 129 // Pop the line, skipping the terminator: 130 strBuffer = strBuffer[(pos+1) .. $]; 131 } 132 else 133 { 134 // More needs to be read here. Copy the tail of the buffer 135 // to the beginning, and try to read with the empty part of 136 // the buffer. 137 // If no buffer was left, extend the size of the buffer before 138 // reading. If the file has ended, then the line is the entire 139 // buffer. 140 141 if (strBuffer.ptr != buffer.ptr) 142 { 143 // Must use memmove because there might be overlap 144 memmove(buffer.ptr, strBuffer.ptr, 145 strBuffer.length * char.sizeof); 146 } 147 const spaceBegin = strBuffer.length; 148 if (strBuffer.length == buffer.length) 149 { 150 // Must extend the buffer to keep reading. 151 assumeSafeAppend(buffer); 152 buffer.length = buffer.length * 2; 153 } 154 const readPart = file.rawRead(buffer[spaceBegin .. $]); 155 if (readPart.length == 0) 156 { 157 // End of the file. Return whats in the buffer. 158 // The next popFront() will try to read again, and then 159 // mark empty condition. 160 if (spaceBegin != 0 && buffer[spaceBegin-1] == '\r') 161 { 162 line = buffer[0 .. spaceBegin-1]; 163 } 164 else 165 { 166 line = buffer[0 .. spaceBegin]; 167 } 168 strBuffer = null; 169 return; 170 } 171 strBuffer = buffer[0 .. spaceBegin + readPart.length]; 172 // Now that we have new data in strBuffer, we can go on. 173 // If a line isn't found, the buffer will be extended again to read more. 174 popFront(); 175 } 176 } 177 } 178 179 auto byLineFast(Terminator = char, 180 Char = char)(File f, 181 KeepTerminator keepTerminator = KeepTerminator.no, 182 string separator = "\n", 183 uint bufferSize = 4096) @safe /+ TODO: lookup preferred block type +/ 184 { 185 return ByLineFast!(Char, Terminator)(f, keepTerminator, separator, bufferSize); 186 } 187 188 version (none): 189 190 version (linux) 191 unittest { 192 import std.stdio: File, writeln; 193 import std.algorithm.searching: count; 194 import tempfs : tempFilePath; 195 import std.file : write; 196 197 const path = tempFilePath("x"); 198 199 writeln(path); 200 File(path, "wb").write("a\n"); 201 202 assert(File(path, "rb").byLineFast.count == 203 File(path, "rb").byLine.count); 204 } 205 206 unittest { 207 import std.stdio: File, writeln; 208 import std.algorithm.searching: count; 209 210 const path = "/media/per/NORDLOW_2019-06-/Knowledge/DBpedia/latest/instance_types_en.ttl"; 211 212 import std.datetime: StopWatch; 213 214 double d1, d2; 215 216 { 217 StopWatch sw; 218 sw.start; 219 const c1 = File(path).byLine.count; 220 sw.stop; 221 d1 = sw.peek.msecs; 222 writeln("byLine: ", d1, "msecs"); 223 } 224 225 { 226 StopWatch sw; 227 sw.start; 228 const c2 = File(path).byLineFast.count; 229 sw.stop; 230 d2 = sw.peek.msecs; 231 writeln("byLineFast: ", d2, "msecs"); 232 } 233 234 writeln("Speed-Up: ", d1 / d2); 235 }