1 module nxt.bylinefast;
2 
3 import std.stdio : File;
4 import std.typecons : Flag;
5 
6 alias KeepTerminator = Flag!"keepTerminator";
7 
8 /**
9    Reads by line in an efficient way (10 times faster than File.byLine from
10    std.stdio).  This is accomplished by reading entire buffers (fgetc() is not
11    used), and allocating as little as possible.
12 
13    The char \n is considered as default separator, removing the previous \r if
14    it exists.
15 
16    The \n is never returned. The \r is not returned if it was
17    part of a \r\n (but it is returned if it was by itself).
18 
19    The returned string is always a substring of a temporary buffer, that must
20    not be stored. If necessary, you must use str[] or .dup or .idup to copy to
21    another string. DIP-25 return qualifier is used in front() to add extra
22    checks in @safe callers of front().
23 
24    Example:
25 
26    File f = File("file.txt");
27    foreach (string line; ByLineFast(f)) {
28    ...process line...
29    //Make a copy:
30    string copy = line[];
31    }
32 
33    The file isn't closed when done iterating, unless it was the only reference to
34    the file (same as std.stdio.byLine). (example: ByLineFast(File("file.txt"))).
35 */
36 struct ByLineFast(Char, Terminator)
37 {
38 	import core.stdc.string : memmove;
39 	import std.stdio : fgetc, ungetc;
40 
41 	File file;
42 	char[] line;
43 	bool first_call = true;
44 	char[] buffer;
45 	char[] strBuffer;
46 	const string separator;
47 	KeepTerminator keepTerminator;
48 
49 	this(File f,
50 		 KeepTerminator kt = KeepTerminator.no,
51 		 string separator = "\n",
52 		 uint bufferSize = 4096) @safe
53 	{
54 		assert(bufferSize > 0);
55 		file = f;
56 		this.separator = separator;
57 		this.keepTerminator = kt;
58 		buffer.length = bufferSize;
59 	}
60 
61 	bool empty() const @property @trusted scope
62 	{
63 		// Its important to check "line !is null" instead of
64 		// "line.length != 0", otherwise, no empty lines can
65 		// be returned, the iteration would be closed.
66 		if (line !is null)
67 		{
68 			return false;
69 		}
70 		if (!file.isOpen)
71 		{
72 			// Clean the buffer to avoid pointer false positives:
73 			(cast(char[])buffer)[] = 0;
74 			return true;
75 		}
76 
77 		// First read. Determine if it's empty and put the char back.
78 		auto mutableFP = (cast(File*) &file).getFP();
79 		const c = fgetc(mutableFP);
80 		if (c == -1)
81 		{
82 			// Clean the buffer to avoid pointer false positives:
83 			(cast(char[])buffer)[] = 0;
84 			return true;
85 		}
86 		if (ungetc(c, mutableFP) != c)
87 		{
88 			assert(0, "Bug in cstdlib implementation");
89 		}
90 		return false;
91 	}
92 
93 	@property char[] front() @safe return scope
94 	{
95 		if (first_call)
96 		{
97 			popFront();
98 			first_call = false;
99 		}
100 		return line;
101 	}
102 
103 	void popFront() @trusted scope
104 	{
105 		if (strBuffer.length == 0)
106 		{
107 			strBuffer = file.rawRead(buffer);
108 			if (strBuffer.length == 0)
109 			{
110 				file.detach();
111 				line = null;
112 				return;
113 			}
114 		}
115 
116 		// import std.string : indexOf; /+ TODO: algorithm indexOf +/
117 		import nxt.algorithm.searching : indexOf;
118 		const pos = strBuffer.indexOf(this.separator);
119 		if (pos != -1)
120 		{
121 			if (pos != 0 && strBuffer[pos-1] == '\r')
122 			{
123 				line = strBuffer[0 .. (pos-1)];
124 			}
125 			else
126 			{
127 				line = strBuffer[0 .. pos];
128 			}
129 			// Pop the line, skipping the terminator:
130 			strBuffer = strBuffer[(pos+1) .. $];
131 		}
132 		else
133 		{
134 			// More needs to be read here. Copy the tail of the buffer
135 			// to the beginning, and try to read with the empty part of
136 			// the buffer.
137 			// If no buffer was left, extend the size of the buffer before
138 			// reading. If the file has ended, then the line is the entire
139 			// buffer.
140 
141 			if (strBuffer.ptr != buffer.ptr)
142 			{
143 				// Must use memmove because there might be overlap
144 				memmove(buffer.ptr, strBuffer.ptr,
145 						strBuffer.length * char.sizeof);
146 			}
147 			const spaceBegin = strBuffer.length;
148 			if (strBuffer.length == buffer.length)
149 			{
150 				// Must extend the buffer to keep reading.
151 				assumeSafeAppend(buffer);
152 				buffer.length = buffer.length * 2;
153 			}
154 			const readPart = file.rawRead(buffer[spaceBegin .. $]);
155 			if (readPart.length == 0)
156 			{
157 				// End of the file. Return whats in the buffer.
158 				// The next popFront() will try to read again, and then
159 				// mark empty condition.
160 				if (spaceBegin != 0 && buffer[spaceBegin-1] == '\r')
161 				{
162 					line = buffer[0 .. spaceBegin-1];
163 				}
164 				else
165 				{
166 					line = buffer[0 .. spaceBegin];
167 				}
168 				strBuffer = null;
169 				return;
170 			}
171 			strBuffer = buffer[0 .. spaceBegin + readPart.length];
172 			// Now that we have new data in strBuffer, we can go on.
173 			// If a line isn't found, the buffer will be extended again to read more.
174 			popFront();
175 		}
176 	}
177 }
178 
179 auto byLineFast(Terminator = char,
180 				Char = char)(File f,
181 							 KeepTerminator keepTerminator = KeepTerminator.no,
182 							 string separator = "\n",
183 							 uint bufferSize = 4096) @safe /+ TODO: lookup preferred block type +/
184 {
185 	return ByLineFast!(Char, Terminator)(f, keepTerminator, separator, bufferSize);
186 }
187 
188 version (none):
189 
190 version (linux)
191 unittest {
192 	import std.stdio: File, writeln;
193 	import std.algorithm.searching: count;
194 	import nxt.file : tempFile;
195 	import std.file : write;
196 
197 	const path = tempFile("x");
198 
199 	writeln(path);
200 	File(path, "wb").write("a\n");
201 
202 	assert(File(path, "rb").byLineFast.count ==
203 		   File(path, "rb").byLine.count);
204 }
205 
206 unittest {
207 	import std.stdio: File, writeln;
208 	import std.algorithm.searching: count;
209 
210 	const path = "/media/per/NORDLOW_2019-06-/Knowledge/DBpedia/latest/instance_types_en.ttl";
211 
212 	import std.datetime: StopWatch;
213 
214 	double d1, d2;
215 
216 	{
217 		StopWatch sw;
218 		sw.start;
219 		const c1 = File(path).byLine.count;
220 		sw.stop;
221 		d1 = sw.peek.msecs;
222 		writeln("byLine: ", d1, "msecs");
223 	}
224 
225 	{
226 		StopWatch sw;
227 		sw.start;
228 		const c2 = File(path).byLineFast.count;
229 		sw.stop;
230 		d2 = sw.peek.msecs;
231 		writeln("byLineFast: ", d2, "msecs");
232 	}
233 
234 	writeln("Speed-Up: ", d1 / d2);
235 }