/** RDF-data model and algorithms.
 *
 * Currently supports N-Triples (.nt).
 *
 * Planned support for RDF Turtle (.ttl) statements (either single-line or multi-line).
 *
 * TODO: can we make `inout` operate only on the members of the returned `NTriple` in `parseNTriple`?
 * TODO: parse Turtle .ttl-files (https://en.wikipedia.org/wiki/Turtle_(syntax))
 * TODO: parse N-Quads for use in Wikidata
 * TODO: parse RDF/XML
 *
 * See_Also: https://en.wikipedia.org/wiki/Resource_Description_Framework
 * See_Also: https://en.wikipedia.org/wiki/Turtle_(syntax)
 * See_Also: https://en.wikipedia.org/wiki/N-Triples#N-Quads
 *
 * See_Also: https://www.ida.liu.se/~robke04/include/publications.shtml
 *
 * TODO: decode `subject` and `object` (in `Db.exprURI`) only when their types are IRIs?
 */
module nxt.rdf;

pure nothrow @safe @nogc:

/++ Subject format. +/
enum SubjectFormat {
	IRI, // See_Also: https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier
	blankNode
}

/++ Object format. +/
enum ObjectFormat {
	IRI, // See_Also: https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier
	blankNode,
	literal
}

/** RDF N-Triple (data model).
 *
 * All textual members are slices of type `const(char)[]` (see `Chars`) and
 * `parse` only re-slices them, so no GC-allocation takes place. When the source
 * text lives in a reused buffer (for instance the temporary `char[]` handed out
 * by `File.byLine`), the members alias that buffer; duplicate them before the
 * buffer is overwritten if the triple is to be stored permanently in memory.
 *
 * See_Also: https://en.wikipedia.org/wiki/N-Triples
 */
struct NTriple {
	import nxt.algorithm.searching : skipOver, skipOverBack, startsWith, endsWith;

	alias Chars = const(char)[];

	/** Parse `subject`, `predicate` and `object`.
	 *
	 * Expects the three members to hold the raw, still delimited, terms of a
	 * statement. Strips the delimiters (`<`...`>` and `"`...`"`) in place, sets
	 * `subjectFormat` and `objectFormat` and, for literals, moves an optional
	 * language tag into `objectLanguageCode` or an optional datatype into
	 * `objectDataTypeIRI`. Escape sequences inside literals are left as is.
	 *
	 * Malformed terms are reported via `assert`.
	 *
	 * NOTE(review): this was earlier documented as failing for
	 * - subject: <http://dbpedia.org/resource/CT_Rei_Pel%C3%A9>
	 * - predicate: <http://xmlns.com/foaf/0.1/homepage>
	 * - object: <http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9>
	 * but the last unittest of this module exercises exactly that triple, so
	 * the remark looks stale; confirm before removing it.
	 */
	void parse() scope pure nothrow @safe @nogc {
		// subject: Standard: https://www.w3.org/TR/n-triples/#grammar-production-subject
		if (subject.skipOver('<')) { // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)
			const ok = subject.skipOverBack('>');
			assert(ok);
			subjectFormat = SubjectFormat.IRI;
		} else // BLANK_NODE_LABEL
			subjectFormat = SubjectFormat.blankNode;

		// predicate: Standard: https://www.w3.org/TR/n-triples/#grammar-production-predicate
		assert(predicate.startsWith('<'));
		assert(predicate.endsWith('>'));
		predicate = predicate[1 .. $ - 1]; // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)

		// object: Standard: https://www.w3.org/TR/n-triples/#grammar-production-object
		if (object.skipOver('<')) { // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)
			const ok = object.skipOverBack('>');
			assert(ok);
			objectFormat = ObjectFormat.IRI;
		} else if (object.skipOver('"')) { // literal (https://www.w3.org/TR/n-triples/#grammar-production-literal)
			/* Locate the closing quote by scanning from the back. Neither a
			 * language tag nor a datatype IRIREF may contain a double-quote, so
			 * the last `"` always is the closing one. A forward search for `"@`
			 * or `"^^` would instead be fooled by an escaped quote inside the
			 * literal, for instance in `"mail \"@home\""@en`. */
			size_t afterQuote = object.length; // index one past the closing quote
			while (afterQuote != 0 && object[afterQuote - 1] != '"')
				--afterQuote;
			if (afterQuote == 0)
				assert(0, "No matching double-quote in object ");

			const suffixLength = object.length - afterQuote;
			if (suffixLength == 0) {
				// plain literal without language tag or datatype
			} else if (object[afterQuote] == '@') { // LANGTAG
				objectLanguageCode = object[afterQuote + 1 .. $];
			} else if (suffixLength >= 2 && object[afterQuote] == '^' && object[afterQuote + 1] == '^') { // datatype IRIREF
				assert(object[afterQuote + 2 .. $].startsWith('<'));
				assert(object[afterQuote + 2 .. $].endsWith('>'));
				objectDataTypeIRI = object[afterQuote + 3 .. $ - 1];
			} else
				assert(0, "Unexpected text after closing double-quote in object ");

			// must come last because the suffix above is sliced out of `object`
			object = object[0 .. afterQuote - 1];

			// dbg(`object:"`, object, `" lang:"`, objectLanguageCode, `" typeIRI:"`, objectDataTypeIRI, `"`);
			objectFormat = ObjectFormat.literal;
		} else { // BLANK_NODE_LABEL (https://www.w3.org/TR/n-triples/#grammar-production-BLANK_NODE_LABEL)
			objectFormat = ObjectFormat.blankNode;
		}
	}

	Chars subject;				///< Subject, without delimiters after `parse`.
	Chars predicate;			///< Predicate IRI, without delimiters after `parse`.

	Chars object;				///< Object, without delimiters after `parse`.
	Chars objectLanguageCode;	///< Language tag of literal object, `null` if absent.
	Chars objectDataTypeIRI;	///< Datatype IRI of literal object, `null` if absent.

	SubjectFormat subjectFormat;
	ObjectFormat objectFormat;
}

/** Decode `line` into an RDF N-Triple.
 *
 * An optional trailing `.`, and one space preceding it, is stripped. The
 * remainder is split at its first two spaces into subject, predicate and
 * object, which then are decoded by `NTriple.parse`.
 *
 * Params:
 *   line = single N-Triples statement; the members of the result are slices of it.
 *
 * Returns: the decoded `NTriple`, carrying the `inout` qualifier of `line`.
 *
 * See_Also: https://www.w3.org/TR/n-triples/
 */
auto parseNTriple(scope return inout(char)[] line) {
	debug const originalLine = line;
	import nxt.algorithm.searching : skipOverBack, indexOf;

	debug assert(line.length >= 4, `Failed to parse too short: "` ~ originalLine ~ `"`);

	// strip suffix
	line.skipOverBack('.');
	line.skipOverBack(' ');

	// subject IRI
	const ix0 = line.indexOf(' '); /+ TODO: use algorithm.findSplit(' ') +/
	debug assert(ix0 != -1, `Failed to parse: "` ~ originalLine ~ `"`);
	const subject = line[0 .. ix0];
	line = line[ix0 + 1 .. $];

	// predicate IRI
	const ix1 = line.indexOf(' '); /+ TODO: use algorithm.findSplit(' ') +/
	debug assert(ix1 != -1, `Failed to parse: "` ~ originalLine ~ `"`);
	const predicate = line[0 .. ix1];
	line = line[ix1 + 1 .. $];

	// what remains of `line` is the object, which itself may contain spaces
	auto nt = inout(NTriple)(subject, predicate, line);
	(cast(NTriple)nt).parse(); // hack to make `inout` work
	return nt;
}

///
pure nothrow @safe @nogc unittest {
	const x = `<http://dbpedia.org/resource/180%C2%B0_(Gerardo_album)> <http://dbpedia.org/ontology/artist> <http://dbpedia.org/resource/Gerardo_Mej%C3%ADa> .`;
	auto nt = x.parseNTriple;
	static assert(is(typeof(nt.subject) == immutable(string))); /+ TODO: should be `string` or `const(char)[]` +/
	static assert(is(typeof(nt.predicate) == immutable(string))); /+ TODO: should be `string` or `const(char)[]` +/
	static assert(is(typeof(nt.object) == immutable(string))); /+ TODO: should be `string` or `const(char)[]` +/
	assert(nt.subject == `http://dbpedia.org/resource/180%C2%B0_(Gerardo_album)`);
	assert(nt.subjectFormat == SubjectFormat.IRI);
	assert(nt.predicate == `http://dbpedia.org/ontology/artist`);
	assert(nt.object == `http://dbpedia.org/resource/Gerardo_Mej%C3%ADa`);
	assert(nt.objectLanguageCode is null);
	assert(nt.objectDataTypeIRI is null);
	assert(nt.objectFormat == ObjectFormat.IRI);
}

///
pure nothrow @safe @nogc unittest {
	const x = `<http://dbpedia.org/resource/1950_Chatham_Cup> <http://xmlns.com/foaf/0.1/name> "Chatham Cup"@en .`;
	const nt = x.parseNTriple;
	assert(nt.subject == `http://dbpedia.org/resource/1950_Chatham_Cup`);
	assert(nt.subjectFormat == SubjectFormat.IRI);
	assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
	assert(nt.object == `Chatham Cup`);
	assert(nt.objectLanguageCode == `en`);
	assert(nt.objectDataTypeIRI is null);
	assert(nt.objectFormat == ObjectFormat.literal);
}

///
pure nothrow @safe @nogc unittest {
	const x = `<http://dbpedia.org/resource/1950_Chatham_Cup> <http://xmlns.com/foaf/0.1/name> "Chatham Cup" .`;
	const nt = x.parseNTriple;
	assert(nt.subject == `http://dbpedia.org/resource/1950_Chatham_Cup`);
	assert(nt.subjectFormat == SubjectFormat.IRI);
	assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
	assert(nt.object == `Chatham Cup`);
	assert(nt.objectLanguageCode is null);
	assert(nt.objectDataTypeIRI is null);
	assert(nt.objectFormat == ObjectFormat.literal);
}

///
pure nothrow @safe @nogc unittest {
	const x = `<http://dbpedia.org/resource/007:_Quantum_of_Solace> <http://dbpedia.org/ontology/releaseDate> "2008-10-31"^^<http://www.w3.org/2001/XMLSchema#date> .`;
	const nt = x.parseNTriple;
	assert(nt.subject == `http://dbpedia.org/resource/007:_Quantum_of_Solace`);
	assert(nt.subjectFormat == SubjectFormat.IRI);
	assert(nt.predicate == `http://dbpedia.org/ontology/releaseDate`);
	assert(nt.object == `2008-10-31`);
	assert(nt.objectLanguageCode is null);
	assert(nt.objectDataTypeIRI == `http://www.w3.org/2001/XMLSchema#date`);
	assert(nt.objectFormat == ObjectFormat.literal);
}

///
pure nothrow @safe @nogc unittest {
	const x = `<http://dbpedia.org/resource/Ceremony_(song)> <http://dbpedia.org/ontology/bSide> "\"In a Lonely Place\"".`;
	const nt = x.parseNTriple;
	assert(nt.subject == `http://dbpedia.org/resource/Ceremony_(song)`);
	assert(nt.subjectFormat == SubjectFormat.IRI);
	assert(nt.predicate == `http://dbpedia.org/ontology/bSide`);
	assert(nt.object == `\"In a Lonely Place\"`); // to be unescaped
	assert(nt.objectLanguageCode is null);
	assert(nt.objectDataTypeIRI is null);
	assert(nt.objectFormat == ObjectFormat.literal);
}

///
pure nothrow @safe @nogc unittest {
	const x = `<http://dbpedia.org/resource/16_@_War> <http://xmlns.com/foaf/0.1/name> "16 @ War"@en .`;
	auto nt = x.parseNTriple;
	assert(nt.subject == `http://dbpedia.org/resource/16_@_War`);
	assert(nt.subjectFormat == SubjectFormat.IRI);
	assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
	assert(nt.object == `16 @ War`);
	assert(nt.objectLanguageCode == `en`);
	assert(nt.objectDataTypeIRI is null);
	assert(nt.objectFormat == ObjectFormat.literal);
}

/// escaped quote directly followed by `@` inside the literal is not a language tag separator
pure nothrow @safe @nogc unittest {
	const x = `<http://example.org/s> <http://example.org/p> "mail \"@home\""@en .`;
	const nt = x.parseNTriple;
	assert(nt.subject == `http://example.org/s`);
	assert(nt.subjectFormat == SubjectFormat.IRI);
	assert(nt.predicate == `http://example.org/p`);
	assert(nt.object == `mail \"@home\"`); // to be unescaped
	assert(nt.objectLanguageCode == `en`);
	assert(nt.objectDataTypeIRI is null);
	assert(nt.objectFormat == ObjectFormat.literal);
}

///
pure nothrow @safe @nogc unittest {
	const x = `<http://dbpedia.org/resource/CT_Rei_Pel%C3%A9> <http://xmlns.com/foaf/0.1/homepage> <http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9> .`;
	auto nt = x.parseNTriple;
	assert(nt.subjectFormat == SubjectFormat.IRI);
	assert(nt.subject == `http://dbpedia.org/resource/CT_Rei_Pel%C3%A9`);
	assert(nt.predicate == `http://xmlns.com/foaf/0.1/homepage`);
	assert(nt.object == `http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9`);
	assert(nt.objectFormat == ObjectFormat.IRI);
}