1 /** RDF-data model and algorithsm.
2  *
3  * Currently supports N-Triples (.nt).
4  *
5  * Planned support for RDF Turtle (.ttl) statements (either single-line or multi-line).
6  *
7  * TODO can we make inout operator only on the members of the returned `NTriple` in `parseNTriple`?
8  * TODO parse Turtle .ttl-files (https://en.wikipedia.org/wiki/Turtle_(syntax))
9  * TODO parse N-Quads for use in Wikidata
10  * TODO parse RDF/XML
11  *
12  * See_Also: https://en.wikipedia.org/wiki/Resource_Description_Framework
13  * See_Also: https://en.wikipedia.org/wiki/Turtle_(syntax)
14  * See_Also: https://en.wikipedia.org/wiki/N-Triples#N-Quads
15  *
16  * See_Also: https://www.ida.liu.se/~robke04/include/publications.shtml
17  *
18  * TODO decode `subject` and `object` (in `Db.exprURI`) only when their types are IRIs?
19  */
20 module nxt.rdf;
21 
22 enum SubjectFormat
23 {
24     IRI, // See_Also: https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier
25     blankNode
26 }
27 enum ObjectFormat
28 {
29     IRI, // See_Also: https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier
30     blankNode,
31     literal
32 }
33 
34 @safe pure nothrow @nogc:
35 
36 /** RDF N-Triple (data model).
37  *
38  * Parameterized on element type $(D Chars). Use NTriple!(char[]) to avoid
39  * GC-allocations when parsing files using File.byLine which returns a volatile
40  * reference to a temporary char[] buffer. If The NTriples are to be stored
41  * permanently in memory use NTriple!string.
42  *
43  * See_Also: https://en.wikipedia.org/wiki/N-Triples
44  */
45 struct NTriple
46 {
47     import nxt.array_algorithm : skipOver, skipOverBack, startsWith, endsWith;
48 
49     alias Chars = const(char)[];
50 
51     /** Parse `subject`, `predicate` and `object`.
52      *
53      * Fails for:
54      * - subject: <http://dbpedia.org/resource/CT_Rei_Pel%C3%A9>
55      * - predicate: <http://xmlns.com/foaf/0.1/homepage>
56      * - object: <http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9>
57      */
58     void parse() @safe pure scope nothrow @nogc
59     {
60         // subject: Standard: https://www.w3.org/TR/n-triples/#grammar-production-subject
61         if (subject.skipOver('<')) // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)
62         {
63             const ok = subject.skipOverBack('>');
64             assert(ok);
65             subjectFormat = SubjectFormat.IRI;
66         }
67         else                // BLANK_NODE_LABEL
68         {
69             subjectFormat = SubjectFormat.blankNode;
70         }
71 
72         // predicate: Standard: https://www.w3.org/TR/n-triples/#grammar-production-predicate
73         assert(predicate.startsWith('<'));
74         assert(predicate.endsWith('>'));
75         predicate = predicate[1 .. $ - 1]; // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)
76 
77         // object: Standard: https://www.w3.org/TR/n-triples/#grammar-production-object
78         if (object.skipOver('<')) // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)
79         {
80             const ok = object.skipOverBack('>');
81             assert(ok);
82             objectFormat = ObjectFormat.IRI;
83         }
84         else if (object.skipOver('"')) // literal (https://www.w3.org/TR/n-triples/#grammar-production-literal)
85         {
86             // import std.ascii : isLower;
87             import nxt.array_algorithm : findSplit;
88             if (const split = object.findSplit(`"@`))
89             {
90                 objectLanguageCode = split.post;
91                 object = split.pre;
92             }
93             else if (auto hit = object.findSplit(`"^^`))
94             {
95                 const objectdataType = hit.post;
96                 assert(objectdataType.startsWith('<'));
97                 assert(objectdataType.endsWith('>'));
98                 objectDataTypeIRI = objectdataType[1 .. $ - 1];
99                 object = hit.pre;
100             }
101             else
102             {
103                 const ok = object.skipOverBack('"');
104                 if (!ok)
105                 {
106                     assert("No matching double-quote in object ");
107                 }
108                 assert(ok);
109             }
110 
111             // dbg(`object:"`, object, `" lang:"`, objectLanguageCode, `" typeIRI:"`, objectDataTypeIRI, `"`);
112             objectFormat = ObjectFormat.literal;
113         }
114         else                // BLANK_NODE_LABEL (https://www.w3.org/TR/n-triples/#grammar-production-BLANK_NODE_LABEL)
115         {
116             objectFormat = ObjectFormat.blankNode;
117         }
118     }
119 
120     Chars subject;
121     Chars predicate;
122 
123     Chars object;
124     Chars objectLanguageCode;
125     Chars objectDataTypeIRI;
126 
127     SubjectFormat subjectFormat;
128     ObjectFormat objectFormat;
129 }
130 
131 /** Decode `line` into an RDF N-Triple.
132  *
133  * See_Also: https://www.w3.org/TR/n-triples/
134  */
135 auto parseNTriple(scope return inout(char)[] line)
136 {
137     debug const originalLine = line;
138     import nxt.array_algorithm : skipOverBack, indexOf;
139 
140     debug assert(line.length >= 4, `Failed to parse too short: "` ~ originalLine ~ `"`);
141 
142     // strip suffix
143     line.skipOverBack('.');
144     line.skipOverBack(' ');
145 
146     // subject IRI
147     const ix0 = line.indexOf(' '); // TODO use array_algorithm.findSplit(' ')
148     debug assert(ix0 != -1, `Failed to parse: "` ~ originalLine ~ `"`);
149     const subject = line[0 .. ix0];
150     line = line[ix0 + 1 .. $];
151 
152     // predicate IRI
153     const ix1 = line.indexOf(' '); // TODO use array_algorithm.findSplit(' ')
154     debug assert(ix1 != -1, `Failed to parse: "` ~ originalLine ~ `"`);
155     const predicate = line[0 .. ix1];
156     line = line[ix1 + 1 .. $];
157 
158     auto nt = inout(NTriple)(subject, predicate, line);
159     (cast(NTriple)nt).parse();  // hack to make `inout` work
160     return nt;
161 }
162 
163 ///
164 @safe pure nothrow @nogc unittest
165 {
166     const x = `<http://dbpedia.org/resource/180%C2%B0_(Gerardo_album)> <http://dbpedia.org/ontology/artist> <http://dbpedia.org/resource/Gerardo_Mej%C3%ADa> .`;
167     auto nt = x.parseNTriple;
168     static assert(is(typeof(nt.subject) == immutable(string))); // TODO should be `string` or `const(char)[]`
169     static assert(is(typeof(nt.predicate) == immutable(string))); // TODO should be `string` or `const(char)[]`
170     static assert(is(typeof(nt.object) == immutable(string))); // TODO should be `string` or `const(char)[]`
171     assert(nt.subject == `http://dbpedia.org/resource/180%C2%B0_(Gerardo_album)`);
172     assert(nt.subjectFormat == SubjectFormat.IRI);
173     assert(nt.predicate == `http://dbpedia.org/ontology/artist`);
174     assert(nt.object == `http://dbpedia.org/resource/Gerardo_Mej%C3%ADa`);
175     assert(nt.objectLanguageCode is null);
176     assert(nt.objectDataTypeIRI is null);
177     assert(nt.objectFormat == ObjectFormat.IRI);
178 }
179 
180 ///
181 @safe pure nothrow @nogc unittest
182 {
183     const x = `<http://dbpedia.org/resource/1950_Chatham_Cup> <http://xmlns.com/foaf/0.1/name> "Chatham Cup"@en .`;
184     const nt = x.parseNTriple;
185     assert(nt.subject == `http://dbpedia.org/resource/1950_Chatham_Cup`);
186     assert(nt.subjectFormat == SubjectFormat.IRI);
187     assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
188     assert(nt.object == `Chatham Cup`);
189     assert(nt.objectLanguageCode == `en`);
190     assert(nt.objectDataTypeIRI is null);
191     assert(nt.objectFormat == ObjectFormat.literal);
192 }
193 
194 ///
195 @safe pure nothrow @nogc unittest
196 {
197     const x = `<http://dbpedia.org/resource/1950_Chatham_Cup> <http://xmlns.com/foaf/0.1/name> "Chatham Cup" .`;
198     const nt = x.parseNTriple;
199     assert(nt.subject == `http://dbpedia.org/resource/1950_Chatham_Cup`);
200     assert(nt.subjectFormat == SubjectFormat.IRI);
201     assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
202     assert(nt.object == `Chatham Cup`);
203     assert(nt.objectLanguageCode is null);
204     assert(nt.objectDataTypeIRI is null);
205     assert(nt.objectFormat == ObjectFormat.literal);
206 }
207 
208 ///
209 @safe pure nothrow @nogc unittest
210 {
211     const x = `<http://dbpedia.org/resource/007:_Quantum_of_Solace> <http://dbpedia.org/ontology/releaseDate> "2008-10-31"^^<http://www.w3.org/2001/XMLSchema#date> .`;
212     const nt = x.parseNTriple;
213     assert(nt.subject == `http://dbpedia.org/resource/007:_Quantum_of_Solace`);
214     assert(nt.subjectFormat == SubjectFormat.IRI);
215     assert(nt.predicate == `http://dbpedia.org/ontology/releaseDate`);
216     assert(nt.object == `2008-10-31`);
217     assert(nt.objectLanguageCode is null);
218     assert(nt.objectDataTypeIRI == `http://www.w3.org/2001/XMLSchema#date`);
219     assert(nt.objectFormat == ObjectFormat.literal);
220 }
221 
222 ///
223 @safe pure nothrow @nogc unittest
224 {
225     const x = `<http://dbpedia.org/resource/Ceremony_(song)> <http://dbpedia.org/ontology/bSide> "\"In a Lonely Place\"".`;
226     const nt = x.parseNTriple;
227     assert(nt.subject == `http://dbpedia.org/resource/Ceremony_(song)`);
228     assert(nt.subjectFormat == SubjectFormat.IRI);
229     assert(nt.predicate == `http://dbpedia.org/ontology/bSide`);
230     assert(nt.object == `\"In a Lonely Place\"`); // to be unescaped
231     assert(nt.objectLanguageCode is null);
232     assert(nt.objectDataTypeIRI is null);
233     assert(nt.objectFormat == ObjectFormat.literal);
234 }
235 
236 ///
237 @safe pure nothrow @nogc unittest
238 {
239     const x = `<http://dbpedia.org/resource/16_@_War> <http://xmlns.com/foaf/0.1/name> "16 @ War"@en .`;
240     auto nt = x.parseNTriple;
241     assert(nt.subject == `http://dbpedia.org/resource/16_@_War`);
242     assert(nt.subjectFormat == SubjectFormat.IRI);
243     assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
244     assert(nt.object == `16 @ War`);
245     assert(nt.objectLanguageCode == `en`);
246     assert(nt.objectDataTypeIRI is null);
247     assert(nt.objectFormat == ObjectFormat.literal);
248 }
249 
250 ///
251 @safe pure nothrow @nogc unittest
252 {
253     const x = `<http://dbpedia.org/resource/CT_Rei_Pel%C3%A9> <http://xmlns.com/foaf/0.1/homepage> <http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9> .`;
254     auto nt = x.parseNTriple;
255     assert(nt.subjectFormat == SubjectFormat.IRI);
256     assert(nt.subject == `http://dbpedia.org/resource/CT_Rei_Pel%C3%A9`);
257     assert(nt.predicate == `http://xmlns.com/foaf/0.1/homepage`);
258     assert(nt.object == `http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9`);
259     assert(nt.objectFormat == ObjectFormat.IRI);
260 }