/** RDF-data model and algorithms.
 *
 * Currently supports N-Triples (.nt).
 *
 * Planned support for RDF Turtle (.ttl) statements (either single-line or multi-line).
 *
 * TODO can we make inout operator only on the members of the returned `NTriple` in `parseNTriple`?
 * TODO parse Turtle .ttl-files (https://en.wikipedia.org/wiki/Turtle_(syntax))
 * TODO parse N-Quads for use in Wikidata
 * TODO parse RDF/XML
 *
 * See_Also: https://en.wikipedia.org/wiki/Resource_Description_Framework
 * See_Also: https://en.wikipedia.org/wiki/Turtle_(syntax)
 * See_Also: https://en.wikipedia.org/wiki/N-Triples#N-Quads
 *
 * See_Also: https://www.ida.liu.se/~robke04/include/publications.shtml
 *
 * TODO decode `subject` and `object` (in `Db.exprURI`) only when their types are IRIs?
 */
module nxt.rdf;

/// Syntactic form of the subject of an N-Triple.
enum SubjectFormat
{
    IRI, // See_Also: https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier
    blankNode
}

/// Syntactic form of the object of an N-Triple.
enum ObjectFormat
{
    IRI, // See_Also: https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier
    blankNode,
    literal
}

@safe pure nothrow @nogc:

/** RDF N-Triple (data model).
 *
 * All textual members are of type `Chars` (`const(char)[]`). `parseNTriple`
 * fills them with slices of the line it is given, so no GC-allocation takes
 * place. When the line comes from `File.byLine`, which returns a volatile
 * reference to a temporary `char[]` buffer, the members must be copied if the
 * triple is to be stored permanently in memory.
 *
 * See_Also: https://en.wikipedia.org/wiki/N-Triples
 */
struct NTriple
{
    import nxt.array_algorithm : skipOver, skipOverBack, startsWith, endsWith;

    alias Chars = const(char)[];

    /** Parse `subject`, `predicate` and `object` in place.
     *
     * Strips the surrounding `<`...`>` from IRIs and the surrounding quotes
     * from literals, splits a literal's language tag (`"..."@lang`) into
     * `objectLanguageCode` or its data type (`"..."^^<iri>`) into
     * `objectDataTypeIRI`, and sets `subjectFormat` and `objectFormat`.
     *
     * Malformed input is only detected via `assert`.
     *
     * Input once noted as problematic (exercised by the last unittest of this
     * module):
     * - subject: <http://dbpedia.org/resource/CT_Rei_Pel%C3%A9>
     * - predicate: <http://xmlns.com/foaf/0.1/homepage>
     * - object: <http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9>
     */
    void parse() @safe pure scope nothrow @nogc
    {
        // subject: Standard: https://www.w3.org/TR/n-triples/#grammar-production-subject
        if (subject.skipOver('<')) // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)
        {
            const ok = subject.skipOverBack('>');
            assert(ok);
            subjectFormat = SubjectFormat.IRI;
        }
        else // BLANK_NODE_LABEL
        {
            subjectFormat = SubjectFormat.blankNode;
        }

        // predicate: Standard: https://www.w3.org/TR/n-triples/#grammar-production-predicate
        assert(predicate.startsWith('<'));
        assert(predicate.endsWith('>'));
        predicate = predicate[1 .. $ - 1]; // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)

        // object: Standard: https://www.w3.org/TR/n-triples/#grammar-production-object
        if (object.skipOver('<')) // IRIREF (https://www.w3.org/TR/n-triples/#grammar-production-IRIREF)
        {
            const ok = object.skipOverBack('>');
            assert(ok);
            objectFormat = ObjectFormat.IRI;
        }
        else if (object.skipOver('"')) // literal (https://www.w3.org/TR/n-triples/#grammar-production-literal)
        {
            // import std.ascii : isLower;
            import nxt.array_algorithm : findSplit;
            if (const split = object.findSplit(`"@`)) // language-tagged literal
            {
                objectLanguageCode = split.post;
                object = split.pre;
            }
            else if (auto hit = object.findSplit(`"^^`)) // typed literal
            {
                const objectdataType = hit.post;
                assert(objectdataType.startsWith('<'));
                assert(objectdataType.endsWith('>'));
                objectDataTypeIRI = objectdataType[1 .. $ - 1];
                object = hit.pre;
            }
            else // plain literal
            {
                const ok = object.skipOverBack('"');
                // The message must be the second argument: asserting on a
                // string literal alone is always true and reports nothing.
                assert(ok, "No matching double-quote in object");
            }

            // dbg(`object:"`, object, `" lang:"`, objectLanguageCode, `" typeIRI:"`, objectDataTypeIRI, `"`);
            objectFormat = ObjectFormat.literal;
        }
        else // BLANK_NODE_LABEL (https://www.w3.org/TR/n-triples/#grammar-production-BLANK_NODE_LABEL)
        {
            objectFormat = ObjectFormat.blankNode;
        }
    }

    Chars subject;              /// Subject, without surrounding `<`...`>` after `parse()`.
    Chars predicate;            /// Predicate IRI, without surrounding `<`...`>` after `parse()`.

    Chars object;               /// Object, without surrounding `<`...`>` or quotes after `parse()`.
    Chars objectLanguageCode;   /// Language tag of a literal object, if any.
    Chars objectDataTypeIRI;    /// Data type IRI of a literal object, if any.

    SubjectFormat subjectFormat;
    ObjectFormat objectFormat;
}

/** Decode `line` into an RDF N-Triple.
 *
 * A trailing `.` and a single space preceding it are stripped. The remainder
 * is split at its first two spaces into subject, predicate and object, which
 * are then handed to `NTriple.parse`.
 *
 * The length and separator checks are `debug assert`s and therefore only
 * present in debug builds.
 *
 * Params:
 *   line = one N-Triples statement
 *
 * Returns: an `inout(NTriple)` whose members are slices of `line`.
 *
 * See_Also: https://www.w3.org/TR/n-triples/
 */
auto parseNTriple(scope return inout(char)[] line)
{
    debug const originalLine = line;
    import nxt.array_algorithm : skipOverBack, indexOf;

    debug assert(line.length >= 4, `Failed to parse too short: "` ~ originalLine ~ `"`);

    // strip suffix
    line.skipOverBack('.');
    line.skipOverBack(' ');

    // subject IRI
    const ix0 = line.indexOf(' '); // TODO use array_algorithm.findSplit(' ')
    debug assert(ix0 != -1, `Failed to parse: "` ~ originalLine ~ `"`);
    const subject = line[0 .. ix0];
    line = line[ix0 + 1 .. $];

    // predicate IRI
    const ix1 = line.indexOf(' '); // TODO use array_algorithm.findSplit(' ')
    debug assert(ix1 != -1, `Failed to parse: "` ~ originalLine ~ `"`);
    const predicate = line[0 .. ix1];
    line = line[ix1 + 1 .. $];

    // what remains of `line` is the object
    auto nt = inout(NTriple)(subject, predicate, line);
    (cast(NTriple)nt).parse(); // hack to make `inout` work
    return nt;
}

///
@safe pure nothrow @nogc unittest
{
    const x = `<http://dbpedia.org/resource/180%C2%B0_(Gerardo_album)> <http://dbpedia.org/ontology/artist> <http://dbpedia.org/resource/Gerardo_Mej%C3%ADa> .`;
    auto nt = x.parseNTriple;
    static assert(is(typeof(nt.subject) == immutable(string))); // TODO should be `string` or `const(char)[]`
    static assert(is(typeof(nt.predicate) == immutable(string))); // TODO should be `string` or `const(char)[]`
    static assert(is(typeof(nt.object) == immutable(string))); // TODO should be `string` or `const(char)[]`
    assert(nt.subject == `http://dbpedia.org/resource/180%C2%B0_(Gerardo_album)`);
    assert(nt.subjectFormat == SubjectFormat.IRI);
    assert(nt.predicate == `http://dbpedia.org/ontology/artist`);
    assert(nt.object == `http://dbpedia.org/resource/Gerardo_Mej%C3%ADa`);
    assert(nt.objectLanguageCode is null);
    assert(nt.objectDataTypeIRI is null);
    assert(nt.objectFormat == ObjectFormat.IRI);
}

///
@safe pure nothrow @nogc unittest
{
    const x = `<http://dbpedia.org/resource/1950_Chatham_Cup> <http://xmlns.com/foaf/0.1/name> "Chatham Cup"@en .`;
    const nt = x.parseNTriple;
    assert(nt.subject == `http://dbpedia.org/resource/1950_Chatham_Cup`);
    assert(nt.subjectFormat == SubjectFormat.IRI);
    assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
    assert(nt.object == `Chatham Cup`);
    assert(nt.objectLanguageCode == `en`);
    assert(nt.objectDataTypeIRI is null);
    assert(nt.objectFormat == ObjectFormat.literal);
}

///
@safe pure nothrow @nogc unittest
{
    const x = `<http://dbpedia.org/resource/1950_Chatham_Cup> <http://xmlns.com/foaf/0.1/name> "Chatham Cup" .`;
    const nt = x.parseNTriple;
    assert(nt.subject == `http://dbpedia.org/resource/1950_Chatham_Cup`);
    assert(nt.subjectFormat == SubjectFormat.IRI);
    assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
    assert(nt.object == `Chatham Cup`);
    assert(nt.objectLanguageCode is null);
    assert(nt.objectDataTypeIRI is null);
    assert(nt.objectFormat == ObjectFormat.literal);
}

///
@safe pure nothrow @nogc unittest
{
    const x = `<http://dbpedia.org/resource/007:_Quantum_of_Solace> <http://dbpedia.org/ontology/releaseDate> "2008-10-31"^^<http://www.w3.org/2001/XMLSchema#date> .`;
    const nt = x.parseNTriple;
    assert(nt.subject == `http://dbpedia.org/resource/007:_Quantum_of_Solace`);
    assert(nt.subjectFormat == SubjectFormat.IRI);
    assert(nt.predicate == `http://dbpedia.org/ontology/releaseDate`);
    assert(nt.object == `2008-10-31`);
    assert(nt.objectLanguageCode is null);
    assert(nt.objectDataTypeIRI == `http://www.w3.org/2001/XMLSchema#date`);
    assert(nt.objectFormat == ObjectFormat.literal);
}

///
@safe pure nothrow @nogc unittest
{
    const x = `<http://dbpedia.org/resource/Ceremony_(song)> <http://dbpedia.org/ontology/bSide> "\"In a Lonely Place\"".`;
    const nt = x.parseNTriple;
    assert(nt.subject == `http://dbpedia.org/resource/Ceremony_(song)`);
    assert(nt.subjectFormat == SubjectFormat.IRI);
    assert(nt.predicate == `http://dbpedia.org/ontology/bSide`);
    assert(nt.object == `\"In a Lonely Place\"`); // to be unescaped
    assert(nt.objectLanguageCode is null);
    assert(nt.objectDataTypeIRI is null);
    assert(nt.objectFormat == ObjectFormat.literal);
}

///
@safe pure nothrow @nogc unittest
{
    const x = `<http://dbpedia.org/resource/16_@_War> <http://xmlns.com/foaf/0.1/name> "16 @ War"@en .`;
    auto nt = x.parseNTriple;
    assert(nt.subject == `http://dbpedia.org/resource/16_@_War`);
    assert(nt.subjectFormat == SubjectFormat.IRI);
    assert(nt.predicate == `http://xmlns.com/foaf/0.1/name`);
    assert(nt.object == `16 @ War`);
    assert(nt.objectLanguageCode == `en`);
    assert(nt.objectDataTypeIRI is null);
    assert(nt.objectFormat == ObjectFormat.literal);
}

///
@safe pure nothrow @nogc unittest
{
    const x = `<http://dbpedia.org/resource/CT_Rei_Pel%C3%A9> <http://xmlns.com/foaf/0.1/homepage> <http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9> .`;
    auto nt = x.parseNTriple;
    assert(nt.subjectFormat == SubjectFormat.IRI);
    assert(nt.subject == `http://dbpedia.org/resource/CT_Rei_Pel%C3%A9`);
    assert(nt.predicate == `http://xmlns.com/foaf/0.1/homepage`);
    assert(nt.object == `http://www.santosfc.com.br/clube/default.asp?c=Sedes&st=CT%20Rei%20Pel%E9`);
    assert(nt.objectFormat == ObjectFormat.IRI);
}