1 /** RDF-data model and algorithms.
2  *
3  * Currently supports N-Triples (.nt).
4  *
5  * Planned support for RDF Turtle (.ttl) statements (either single-line or multi-line).
6  *
7  * TODO: can we make inout operator only on the members of the returned `NTriple` in `parseNTriple`?
8  * TODO: parse Turtle .ttl-files (https://en.wikipedia.org/wiki/Turtle_(syntax))
9  * TODO: parse N-Quads for use in Wikidata
10  * TODO: parse RDF/XML
11  *
12  * See_Also: https://en.wikipedia.org/wiki/Resource_Description_Framework
13  * See_Also: https://en.wikipedia.org/wiki/Turtle_(syntax)
14  * See_Also: https://en.wikipedia.org/wiki/N-Triples#N-Quads
15  *
16  * See_Also: https://www.ida.liu.se/~robke04/include/publications.shtml
17  *
18  * TODO: decode `subject` and `object` (in `Db.exprURI`) only when their types are IRIs?
19  */
20 module nxt.rdf;
21 
22 pure nothrow @safe @nogc:
23 
24 /++ Subject format. +/
25 enum SubjectFormat {
26 	IRI, // See_Also: https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier
27 	blankNode
28 }
29 
30 /++ Object format. +/
31 enum ObjectFormat {
32 	IRI, // See_Also: https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier
33 	blankNode,
34 	literal
35 }
36 
37 /** RDF N-Triple (data model).
38  *
39  * Parameterized on element type $(D Chars). Use NTriple!(char[]) to avoid
40  * GC-allocations when parsing files using File.byLine which returns a volatile
41  * reference to a temporary char[] buffer. If The NTriples are to be stored
42  * permanently in memory use NTriple!string.
43  *
44  * See_Also: https://en.wikipedia.org/wiki/N-Triples
45  */
46 struct NTriple {
47 	import nxt.algorithm.searching : skipOver, skipOverBack, startsWith, endsWith;
48 
49 	alias Chars = const(char)[];
50 
51 	/** Parse `subject`, `predicate` and `object`.
52 	 *
53 	 * Fails for:
54 	 * - subject: <http://dbpedia.org/resource/CT_Rei_Pel%C3%A9>
55 	 * - predicate: <http://xmlns.com/foaf/0.1/homepage>
56 	 * - object: <http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9>
57 	 */
58 	void parse() scope pure nothrow @safe @nogc {
59 		// subject: Standard: https://www.w3.org/TR/n-triples/#grammar-production-subject
60 		if (subject.skipOver('<')) { // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)
61 			const ok = subject.skipOverBack('>');
62 			assert(ok);
63 			subjectFormat = SubjectFormat.IRI;
64 		} else  // BLANK_NODE_LABEL
65 			subjectFormat = SubjectFormat.blankNode;
66 
67 		// predicate: Standard: https://www.w3.org/TR/n-triples/#grammar-production-predicate
68 		assert(predicate.startsWith('<'));
69 		assert(predicate.endsWith('>'));
70 		predicate = predicate[1 .. $ - 1]; // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)
71 
72 		// object: Standard: https://www.w3.org/TR/n-triples/#grammar-production-object
73 		if (object.skipOver('<')) { // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)
74 			const ok = object.skipOverBack('>');
75 			assert(ok);
76 			objectFormat = ObjectFormat.IRI;
77 		} else if (object.skipOver('"')) { // literal (https://www.w3.org/TR/n-triples/#grammar-production-literal)
78 			// import std.ascii : isLower;
79 			import nxt.algorithm.searching : findSplit;
80 			if (const split = object.findSplit(`"@`)) {
81 				() @trusted { // TODO: -dip1000 without @trusted
82 					objectLanguageCode = split.post;
83 					object = split.pre;
84 				}();
85 			} else if (auto hit = object.findSplit(`"^^`)) {
86 				const objectdataType = hit.post;
87 				assert(objectdataType.startsWith('<'));
88 				assert(objectdataType.endsWith('>'));
89 				() @trusted { // TODO: -dip1000 without @trusted
90 					objectDataTypeIRI = objectdataType[1 .. $ - 1];
91 					object = hit.pre;
92 				}();
93 			} else {
94 				const ok = object.skipOverBack('"');
95 				if (!ok)
96 					assert(0, "No matching double-quote in object ");
97 				assert(ok);
98 			}
99 
100 			// dbg(`object:"`, object, `" lang:"`, objectLanguageCode, `" typeIRI:"`, objectDataTypeIRI, `"`);
101 			objectFormat = ObjectFormat.literal;
102 		} else { // BLANK_NODE_LABEL (https://www.w3.org/TR/n-triples/#grammar-production-BLANK_NODE_LABEL)
103 			objectFormat = ObjectFormat.blankNode;
104 		}
105 	}
106 
107 	Chars subject;
108 	Chars predicate;
109 
110 	Chars object;
111 	Chars objectLanguageCode;
112 	Chars objectDataTypeIRI;
113 
114 	SubjectFormat subjectFormat;
115 	ObjectFormat objectFormat;
116 }
117 
118 /** Decode `line` into an RDF N-Triple.
119  *
120  * See_Also: https://www.w3.org/TR/n-triples/
121  */
122 auto parseNTriple(scope return inout(char)[] line) {
123 	debug const originalLine = line;
124 	import nxt.algorithm.searching : skipOverBack, indexOf;
125 
126 	debug assert(line.length >= 4, `Failed to parse too short: "` ~ originalLine ~ `"`);
127 
128 	// strip suffix
129 	line.skipOverBack('.');
130 	line.skipOverBack(' ');
131 
132 	// subject IRI
133 	const ix0 = line.indexOf(' '); /+ TODO: use algorithm.findSplit(' ') +/
134 	debug assert(ix0 != -1, `Failed to parse: "` ~ originalLine ~ `"`);
135 	const subject = line[0 .. ix0];
136 	line = line[ix0 + 1 .. $];
137 
138 	// predicate IRI
139 	const ix1 = line.indexOf(' '); /+ TODO: use algorithm.findSplit(' ') +/
140 	debug assert(ix1 != -1, `Failed to parse: "` ~ originalLine ~ `"`);
141 	const predicate = line[0 .. ix1];
142 	line = line[ix1 + 1 .. $];
143 
144 	auto nt = inout(NTriple)(subject, predicate, line);
145 	(cast(NTriple)nt).parse();  // hack to make `inout` work
146 	return nt;
147 }
148 
149 ///
150 pure nothrow @safe @nogc unittest {
151 	const x = `<http://dbpedia.org/resource/180%C2%B0_(Gerardo_album)> <http://dbpedia.org/ontology/artist> <http://dbpedia.org/resource/Gerardo_Mej%C3%ADa> .`;
152 	auto nt = x.parseNTriple;
153 	static assert(is(typeof(nt.subject) == immutable(string))); /+ TODO: should be `string` or `const(char)[]` +/
154 	static assert(is(typeof(nt.predicate) == immutable(string))); /+ TODO: should be `string` or `const(char)[]` +/
155 	static assert(is(typeof(nt.object) == immutable(string))); /+ TODO: should be `string` or `const(char)[]` +/
156 	assert(nt.subject == `http://dbpedia.org/resource/180%C2%B0_(Gerardo_album)`);
157 	assert(nt.subjectFormat == SubjectFormat.IRI);
158 	assert(nt.predicate == `http://dbpedia.org/ontology/artist`);
159 	assert(nt.object == `http://dbpedia.org/resource/Gerardo_Mej%C3%ADa`);
160 	assert(nt.objectLanguageCode is null);
161 	assert(nt.objectDataTypeIRI is null);
162 	assert(nt.objectFormat == ObjectFormat.IRI);
163 }
164 
165 ///
166 pure nothrow @safe @nogc unittest {
167 	const x = `<http://dbpedia.org/resource/1950_Chatham_Cup> <http://xmlns.com/foaf/0.1/name> "Chatham Cup"@en .`;
168 	const nt = x.parseNTriple;
169 	assert(nt.subject == `http://dbpedia.org/resource/1950_Chatham_Cup`);
170 	assert(nt.subjectFormat == SubjectFormat.IRI);
171 	assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
172 	assert(nt.object == `Chatham Cup`);
173 	assert(nt.objectLanguageCode == `en`);
174 	assert(nt.objectDataTypeIRI is null);
175 	assert(nt.objectFormat == ObjectFormat.literal);
176 }
177 
178 ///
179 pure nothrow @safe @nogc unittest {
180 	const x = `<http://dbpedia.org/resource/1950_Chatham_Cup> <http://xmlns.com/foaf/0.1/name> "Chatham Cup" .`;
181 	const nt = x.parseNTriple;
182 	assert(nt.subject == `http://dbpedia.org/resource/1950_Chatham_Cup`);
183 	assert(nt.subjectFormat == SubjectFormat.IRI);
184 	assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
185 	assert(nt.object == `Chatham Cup`);
186 	assert(nt.objectLanguageCode is null);
187 	assert(nt.objectDataTypeIRI is null);
188 	assert(nt.objectFormat == ObjectFormat.literal);
189 }
190 
191 ///
192 pure nothrow @safe @nogc unittest {
193 	const x = `<http://dbpedia.org/resource/007:_Quantum_of_Solace> <http://dbpedia.org/ontology/releaseDate> "2008-10-31"^^<http://www.w3.org/2001/XMLSchema#date> .`;
194 	const nt = x.parseNTriple;
195 	assert(nt.subject == `http://dbpedia.org/resource/007:_Quantum_of_Solace`);
196 	assert(nt.subjectFormat == SubjectFormat.IRI);
197 	assert(nt.predicate == `http://dbpedia.org/ontology/releaseDate`);
198 	assert(nt.object == `2008-10-31`);
199 	assert(nt.objectLanguageCode is null);
200 	assert(nt.objectDataTypeIRI == `http://www.w3.org/2001/XMLSchema#date`);
201 	assert(nt.objectFormat == ObjectFormat.literal);
202 }
203 
204 ///
205 pure nothrow @safe @nogc unittest {
206 	const x = `<http://dbpedia.org/resource/Ceremony_(song)> <http://dbpedia.org/ontology/bSide> "\"In a Lonely Place\"".`;
207 	const nt = x.parseNTriple;
208 	assert(nt.subject == `http://dbpedia.org/resource/Ceremony_(song)`);
209 	assert(nt.subjectFormat == SubjectFormat.IRI);
210 	assert(nt.predicate == `http://dbpedia.org/ontology/bSide`);
211 	assert(nt.object == `\"In a Lonely Place\"`); // to be unescaped
212 	assert(nt.objectLanguageCode is null);
213 	assert(nt.objectDataTypeIRI is null);
214 	assert(nt.objectFormat == ObjectFormat.literal);
215 }
216 
217 ///
218 pure nothrow @safe @nogc unittest {
219 	const x = `<http://dbpedia.org/resource/16_@_War> <http://xmlns.com/foaf/0.1/name> "16 @ War"@en .`;
220 	auto nt = x.parseNTriple;
221 	assert(nt.subject == `http://dbpedia.org/resource/16_@_War`);
222 	assert(nt.subjectFormat == SubjectFormat.IRI);
223 	assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
224 	assert(nt.object == `16 @ War`);
225 	assert(nt.objectLanguageCode == `en`);
226 	assert(nt.objectDataTypeIRI is null);
227 	assert(nt.objectFormat == ObjectFormat.literal);
228 }
229 
230 ///
231 pure nothrow @safe @nogc unittest {
232 	const x = `<http://dbpedia.org/resource/CT_Rei_Pel%C3%A9> <http://xmlns.com/foaf/0.1/homepage> <http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9> .`;
233 	auto nt = x.parseNTriple;
234 	assert(nt.subjectFormat == SubjectFormat.IRI);
235 	assert(nt.subject == `http://dbpedia.org/resource/CT_Rei_Pel%C3%A9`);
236 	assert(nt.predicate == `http://xmlns.com/foaf/0.1/homepage`);
237 	assert(nt.object == `http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9`);
238 	assert(nt.objectFormat == ObjectFormat.IRI);
239 }