1 module nxt.bylinefast;
2 
3 import std.stdio : File;
4 import std.typecons : Flag;
5 
6 alias KeepTerminator = Flag!"keepTerminator";
7 
8 /**
9    Reads by line in an efficient way (10 times faster than File.byLine from
10    std.stdio).  This is accomplished by reading entire buffers (fgetc() is not
11    used), and allocating as little as possible.
12 
13    The char \n is considered as default separator, removing the previous \r if
14    it exists.
15 
16    The \n is never returned. The \r is not returned if it was
17    part of a \r\n (but it is returned if it was by itself).
18 
19    The returned string is always a substring of a temporary buffer, that must
20    not be stored. If necessary, you must use str[] or .dup or .idup to copy to
21    another string. DIP-25 return qualifier is used in front() to add extra
22    checks in @safe callers of front().
23 
24    Example:
25 
26    File f = File("file.txt");
27    foreach (string line; ByLineFast(f)) {
28    ...process line...
29    //Make a copy:
30    string copy = line[];
31    }
32 
33    The file isn't closed when done iterating, unless it was the only reference to
34    the file (same as std.stdio.byLine). (example: ByLineFast(File("file.txt"))).
35 */
36 struct ByLineFast(Char, Terminator)
37 {
38     import core.stdc.string : memmove;
39     import std.stdio : fgetc, ungetc;
40 
41     File file;
42     char[] line;
43     bool first_call = true;
44     char[] buffer;
45     char[] strBuffer;
46     const string separator;
47     KeepTerminator keepTerminator;
48 
49     this(File f,
50          KeepTerminator kt = KeepTerminator.no,
51          string separator = "\n",
52          uint bufferSize = 4096) @safe
53     {
54         assert(bufferSize > 0);
55         file = f;
56         this.separator = separator;
57         this.keepTerminator = kt;
58         buffer.length = bufferSize;
59     }
60 
61     @property bool empty() const @trusted scope
62     {
63         // Its important to check "line !is null" instead of
64         // "line.length != 0", otherwise, no empty lines can
65         // be returned, the iteration would be closed.
66         if (line !is null)
67         {
68             return false;
69         }
70         if (!file.isOpen)
71         {
72             // Clean the buffer to avoid pointer false positives:
73             (cast(char[])buffer)[] = 0;
74             return true;
75         }
76 
77         // First read. Determine if it's empty and put the char back.
78         auto mutableFP = (cast(File*) &file).getFP();
79         const c = fgetc(mutableFP);
80         if (c == -1)
81         {
82             // Clean the buffer to avoid pointer false positives:
83             (cast(char[])buffer)[] = 0;
84             return true;
85         }
86         if (ungetc(c, mutableFP) != c)
87         {
88             assert(0, "Bug in cstdlib implementation");
89         }
90         return false;
91     }
92 
93     @property char[] front() @safe return scope
94     {
95         if (first_call)
96         {
97             popFront();
98             first_call = false;
99         }
100         return line;
101     }
102 
103     void popFront() @trusted scope
104     {
105         if (strBuffer.length == 0)
106         {
107             strBuffer = file.rawRead(buffer);
108             if (strBuffer.length == 0)
109             {
110                 file.detach();
111                 line = null;
112                 return;
113             }
114         }
115 
116         // import std.string : indexOf; // TODO array_algorithm indexOf
117         import nxt.array_algorithm : indexOf;
118         const pos = strBuffer.indexOf(this.separator);
119         if (pos != -1)
120         {
121             if (pos != 0 && strBuffer[pos-1] == '\r')
122             {
123                 line = strBuffer[0 .. (pos-1)];
124             }
125             else
126             {
127                 line = strBuffer[0 .. pos];
128             }
129             // Pop the line, skipping the terminator:
130             strBuffer = strBuffer[(pos+1) .. $];
131         }
132         else
133         {
134             // More needs to be read here. Copy the tail of the buffer
135             // to the beginning, and try to read with the empty part of
136             // the buffer.
137             // If no buffer was left, extend the size of the buffer before
138             // reading. If the file has ended, then the line is the entire
139             // buffer.
140 
141             if (strBuffer.ptr != buffer.ptr)
142             {
143                 // Must use memmove because there might be overlap
144                 memmove(buffer.ptr, strBuffer.ptr,
145                         strBuffer.length * char.sizeof);
146             }
147             const spaceBegin = strBuffer.length;
148             if (strBuffer.length == buffer.length)
149             {
150                 // Must extend the buffer to keep reading.
151                 assumeSafeAppend(buffer);
152                 buffer.length = buffer.length * 2;
153             }
154             const readPart = file.rawRead(buffer[spaceBegin .. $]);
155             if (readPart.length == 0)
156             {
157                 // End of the file. Return whats in the buffer.
158                 // The next popFront() will try to read again, and then
159                 // mark empty condition.
160                 if (spaceBegin != 0 && buffer[spaceBegin-1] == '\r')
161                 {
162                     line = buffer[0 .. spaceBegin-1];
163                 }
164                 else
165                 {
166                     line = buffer[0 .. spaceBegin];
167                 }
168                 strBuffer = null;
169                 return;
170             }
171             strBuffer = buffer[0 .. spaceBegin + readPart.length];
172             // Now that we have new data in strBuffer, we can go on.
173             // If a line isn't found, the buffer will be extended again to read more.
174             popFront();
175         }
176     }
177 }
178 
179 auto byLineFast(Terminator = char,
180                 Char = char)(File f,
181                              KeepTerminator keepTerminator = KeepTerminator.no,
182                              string separator = "\n",
183                              uint bufferSize = 4096) @safe // TODO lookup preferred block type
184 {
185     return ByLineFast!(Char, Terminator)(f, keepTerminator, separator, bufferSize);
186 }
187 
188 version(none):
189 
190 version(linux)
191 unittest
192 {
193     import std.stdio: File, writeln;
194     import std.algorithm.searching: count;
195     import tempfs : tempFilePath;
196     import std.file : write;
197 
198     const path = tempFilePath("x");
199 
200     writeln(path);
201     File(path, "wb").write("a\n");
202 
203     assert(File(path, "rb").byLineFast.count ==
204            File(path, "rb").byLine.count);
205 }
206 
207 unittest
208 {
209     import std.stdio: File, writeln;
210     import std.algorithm.searching: count;
211 
212     const path = "/media/per/NORDLOW_2019-06-/Knowledge/DBpedia/latest/instance_types_en.ttl";
213 
214     import std.datetime: StopWatch;
215 
216     double d1, d2;
217 
218     {
219         StopWatch sw;
220         sw.start;
221         const c1 = File(path).byLine.count;
222         sw.stop;
223         d1 = sw.peek.msecs;
224         writeln("byLine: ", d1, "msecs");
225     }
226 
227     {
228         StopWatch sw;
229         sw.start;
230         const c2 = File(path).byLineFast.count;
231         sw.stop;
232         d2 = sw.peek.msecs;
233         writeln("byLineFast: ", d2, "msecs");
234     }
235 
236     writeln("Speed-Up: ", d1 / d2);
237 }