1 /** W3C (XML/HTML) Formatting.
2 
3 	Copyright: Per Nordlöw 2022-.
4 	License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
5 	Authors: $(WEB Per Nordlöw)
6 */
7 
8 module nxt.w3c;
9 
10 import std.traits: isSomeChar, isSomeString;
11 
/** Convert character $(D c) to its HTML representation.

	Params:
		c = character to encode.
		nbsp = when `true`, a space is encoded as the non-breaking-space
			entity instead of being passed through unchanged.

	Returns: the named entity for space (only if $(D nbsp)), ampersand,
	less-than, greater-than and double quote; the character itself for any
	other code point in the range 1 .. 127; and a decimal numeric character
	reference (`&#N;`) for everything else, including NUL.
*/
string toHTML(C)(C c, bool nbsp = true) @safe pure
if (isSomeChar!C)
{
	import std.conv : to;
	if	  (nbsp && c == ' ') return "&nbsp;"; // non breaking space
	else if (c == '&')		 return "&amp;";  // ampersand, must be escaped or following text may be read as an entity
	else if (c == '<')		 return "&lt;";   // less than
	else if (c == '>')		 return "&gt;";   // greater than
	else if (c == '\"')		return "&quot;"; // double quote
//		else if (c == '\'')
//			return ("&#39;"); // if you are in an attribute, it might be important to encode for the same reason as double quotes
	// FIXME: should I encode apostrophes too? as &#39;... I could also do space but if your html is so bad that it doesn't
	// quote attributes at all, maybe you deserve the xss. Encoding spaces will make everything really ugly so meh
	// idk about apostrophes though. Might be worth it, might not.
	else if (0 < c && c < 128)
		return to!string(cast(char)c); // remaining non-NUL ASCII is passed through verbatim
	else
		return "&#" ~ to!string(cast(int)c) ~ ";"; // NUL and non-ASCII: decimal numeric reference
}
32 
/** Convert string $(D s) to its HTML representation.

	Copied from arsd.dom.

	Params:
		s = input range of characters to encode.
		nbsp = forwarded to $(D toHTML) for every element; when `true`,
			spaces are encoded as non-breaking spaces.

	Returns: a lazy range over the concatenation of $(D toHTML) applied to
	each element of $(D s).
*/
auto encodeHTML(S)(scope S s, bool nbsp = true) @safe pure
{
	import std.algorithm : joiner, map;
	// Forward `nbsp` explicitly: `map!toHTML` would always call `toHTML`
	// with its default argument and silently ignore the caller's choice.
	return s.map!(c => toHTML(c, nbsp)).joiner(``);
}
40 
// Encoding a plain string: angle brackets and spaces become entities, every
// other character of the input is kept as is.
pure @safe unittest {
	enum markup = `<!-- --><script>/* */</script>`;
	enum expected = `&lt;!--&nbsp;--&gt;&lt;script&gt;/*&nbsp;*/&lt;/script&gt;`;
	assert(equal(encodeHTML(markup), expected));
}
46 
// Same check as for a plain string, but feeding `encodeHTML` a `byDchar`
// range. Currently compiled out.
version (none)					/+ TODO: enable +/
pure @safe unittest {
	import std.utf : byDchar;
	enum markup = `<!-- --><script>/* */</script>`;
	enum expected = `&lt;!--&nbsp;--&gt;&lt;script&gt;/*&nbsp;*/&lt;/script&gt;`;
	assert(equal(encodeHTML(byDchar(markup)), expected));
}
55 
/** Lookup table from a Latin-1 code point (used as the index) to a character
	entity reference.

	Filled in by $(D initTables), which the module constructor calls. Slots
	that $(D initTables) does not assign keep the default `null`. The stored
	references do not include the terminating `;`.

	Declared `__gshared`, i.e. a single instance for the whole program rather
	than one per thread.
 */
// See_Also: https://en.wikipedia.org/wiki/Character_entity_reference#Predefined_entities_in_XML
__gshared string[256] convLatin1ToXML;
// See_Also: https://en.wikipedia.org/wiki/Character_entity_reference#Character_entity_references_in_HTML
// string[256] convLatin1ToHTML;
60 
// Module constructor: populates the entity lookup table by calling
// `initTables`. As a `shared static this`, it runs once per program rather
// than once per thread.
shared static this()
{
	initTables();
}
65 
/** Populate $(D convLatin1ToXML) with the entity reference for each Latin-1
	code point that has a named entity: the five predefined XML entities and
	the HTML Latin-1 set U+00A0 .. U+00FF.

	All other slots are left untouched.

	NOTE(review): the stored references omit the terminating `;`; presumably
	callers append it. Confirm against the call sites before changing the
	strings.
*/
void initTables() nothrow @nogc
{
	// Predefined XML entities. Indexed by character literal so that index and
	// entity cannot drift apart (decimal 60/62 were previously typed as hex,
	// and '.' was used in place of '&').
	convLatin1ToXML['"'] = "&quot";  // U+0022 (34)	HTML 2.0	HTMLspecial	ISOnum	quotation mark (= APL quote)
	convLatin1ToXML['&'] = "&amp";   // U+0026 (38)	HTML 2.0	HTMLspecial	ISOnum	ampersand
	convLatin1ToXML['\''] = "&apos"; // U+0027 (39)	XHTML 1.0	HTMLspecial	ISOnum	apostrophe (= apostrophe-quote); see below
	convLatin1ToXML['<'] = "&lt";    // U+003C (60)	HTML 2.0	HTMLspecial	ISOnum	less-than sign
	convLatin1ToXML['>'] = "&gt";    // U+003E (62)	HTML 2.0	HTMLspecial	ISOnum	greater-than sign

	// HTML Latin-1 entities, U+00A0 .. U+00FF.
	convLatin1ToXML[0xA0] = "&nbsp"; // nbsp	 	U+00A0 (160)	HTML 3.2	HTMLlat1	ISOnum	no-break space (= non-breaking space)[d]
	convLatin1ToXML[0xA1] = "&iexcl"; // iexcl	¡	U+00A1 (161)	HTML 3.2	HTMLlat1	ISOnum	inverted exclamation mark
	convLatin1ToXML[0xA2] = "&cent"; // cent	¢	U+00A2 (162)	HTML 3.2	HTMLlat1	ISOnum	cent sign
	convLatin1ToXML[0xA3] = "&pound"; // pound	£	U+00A3 (163)	HTML 3.2	HTMLlat1	ISOnum	pound sign
	convLatin1ToXML[0xA4] = "&curren"; // curren	¤	U+00A4 (164)	HTML 3.2	HTMLlat1	ISOnum	currency sign
	convLatin1ToXML[0xA5] = "&yen"; // yen	¥	U+00A5 (165)	HTML 3.2	HTMLlat1	ISOnum	yen sign (= yuan sign)
	convLatin1ToXML[0xA6] = "&brvbar"; // brvbar	¦	U+00A6 (166)	HTML 3.2	HTMLlat1	ISOnum	broken bar (= broken vertical bar)
	convLatin1ToXML[0xA7] = "&sect"; // sect	§	U+00A7 (167)	HTML 3.2	HTMLlat1	ISOnum	section sign
	convLatin1ToXML[0xA8] = "&uml"; // uml	¨	U+00A8 (168)	HTML 3.2	HTMLlat1	ISOdia	diaeresis (= spacing diaeresis); see Germanic umlaut
	convLatin1ToXML[0xA9] = "&copy"; // copy	©	U+00A9 (169)	HTML 3.2	HTMLlat1	ISOnum	copyright symbol
	convLatin1ToXML[0xAA] = "&ordf"; // ordf	ª	U+00AA (170)	HTML 3.2	HTMLlat1	ISOnum	feminine ordinal indicator
	convLatin1ToXML[0xAB] = "&laquo"; // laquo	«	U+00AB (171)	HTML 3.2	HTMLlat1	ISOnum	left-pointing double angle quotation mark (= left pointing guillemet)
	convLatin1ToXML[0xAC] = "&not"; // not	¬	U+00AC (172)	HTML 3.2	HTMLlat1	ISOnum	not sign
	convLatin1ToXML[0xAD] = "&shy"; // shy	 	U+00AD (173)	HTML 3.2	HTMLlat1	ISOnum	soft hyphen (= discretionary hyphen)
	convLatin1ToXML[0xAE] = "&reg"; // reg	®	U+00AE (174)	HTML 3.2	HTMLlat1	ISOnum	registered sign ( = registered trademark symbol)
	convLatin1ToXML[0xAF] = "&macr"; // macr	¯	U+00AF (175)	HTML 3.2	HTMLlat1	ISOdia	macron (= spacing macron = overline = APL overbar)
	convLatin1ToXML[0xB0] = "&deg"; // deg	°	U+00B0 (176)	HTML 3.2	HTMLlat1	ISOnum	degree symbol
	convLatin1ToXML[0xB1] = "&plusmn"; // plusmn	±	U+00B1 (177)	HTML 3.2	HTMLlat1	ISOnum	plus-minus sign (= plus-or-minus sign)
	convLatin1ToXML[0xB2] = "&sup2"; // sup2	²	U+00B2 (178)	HTML 3.2	HTMLlat1	ISOnum	superscript two (= superscript digit two = squared)
	convLatin1ToXML[0xB3] = "&sup3"; // sup3	³	U+00B3 (179)	HTML 3.2	HTMLlat1	ISOnum	superscript three (= superscript digit three = cubed)
	convLatin1ToXML[0xB4] = "&acute"; // acute	´	U+00B4 (180)	HTML 3.2	HTMLlat1	ISOdia	acute accent (= spacing acute)
	convLatin1ToXML[0xB5] = "&micro"; // micro	µ	U+00B5 (181)	HTML 3.2	HTMLlat1	ISOnum	micro sign
	convLatin1ToXML[0xB6] = "&para"; // para	¶	U+00B6 (182)	HTML 3.2	HTMLlat1	ISOnum	pilcrow sign ( = paragraph sign)
	convLatin1ToXML[0xB7] = "&middot"; // middot	·	U+00B7 (183)	HTML 3.2	HTMLlat1	ISOnum	middle dot (= Georgian comma = Greek middle dot)
	convLatin1ToXML[0xB8] = "&cedil"; // cedil	¸	U+00B8 (184)	HTML 3.2	HTMLlat1	ISOdia	cedilla (= spacing cedilla)
	convLatin1ToXML[0xB9] = "&sup1"; // sup1	¹	U+00B9 (185)	HTML 3.2	HTMLlat1	ISOnum	superscript one (= superscript digit one)
	convLatin1ToXML[0xBA] = "&ordm"; // ordm	º	U+00BA (186)	HTML 3.2	HTMLlat1	ISOnum	masculine ordinal indicator
	convLatin1ToXML[0xBB] = "&raquo"; // raquo	»	U+00BB (187)	HTML 3.2	HTMLlat1	ISOnum	right-pointing double angle quotation mark (= right pointing guillemet)
	convLatin1ToXML[0xBC] = "&frac14"; // frac14	¼	U+00BC (188)	HTML 3.2	HTMLlat1	ISOnum	vulgar fraction one quarter (= fraction one quarter)
	convLatin1ToXML[0xBD] = "&frac12"; // frac12	½	U+00BD (189)	HTML 3.2	HTMLlat1	ISOnum	vulgar fraction one half (= fraction one half)
	convLatin1ToXML[0xBE] = "&frac34"; // frac34	¾	U+00BE (190)	HTML 3.2	HTMLlat1	ISOnum	vulgar fraction three quarters (= fraction three quarters)
	convLatin1ToXML[0xBF] = "&iquest"; // iquest	¿	U+00BF (191)	HTML 3.2	HTMLlat1	ISOnum	inverted question mark (= turned question mark)
	convLatin1ToXML[0xC0] = "&Agrave"; // Agrave	À	U+00C0 (192)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter A with grave accent (= Latin capital letter A grave)
	convLatin1ToXML[0xC1] = "&Aacute"; // Aacute	Á	U+00C1 (193)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter A with acute accent
	convLatin1ToXML[0xC2] = "&Acirc"; // Acirc	Â	U+00C2 (194)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter A with circumflex
	convLatin1ToXML[0xC3] = "&Atilde"; // Atilde	Ã	U+00C3 (195)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter A with tilde
	convLatin1ToXML[0xC4] = "&Auml"; // Auml	Ä	U+00C4 (196)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter A with diaeresis
	convLatin1ToXML[0xC5] = "&Aring"; // Aring	Å	U+00C5 (197)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter A with ring above (= Latin capital letter A ring)
	convLatin1ToXML[0xC6] = "&AElig"; // AElig	Æ	U+00C6 (198)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter AE (= Latin capital ligature AE)
	convLatin1ToXML[0xC7] = "&Ccedil"; // Ccedil	Ç	U+00C7 (199)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter C with cedilla
	convLatin1ToXML[0xC8] = "&Egrave"; // Egrave	È	U+00C8 (200)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter E with grave accent
	convLatin1ToXML[0xC9] = "&Eacute"; // Eacute	É	U+00C9 (201)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter E with acute accent
	convLatin1ToXML[0xCA] = "&Ecirc"; // Ecirc	Ê	U+00CA (202)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter E with circumflex
	convLatin1ToXML[0xCB] = "&Euml"; // Euml	Ë	U+00CB (203)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter E with diaeresis
	convLatin1ToXML[0xCC] = "&Igrave"; // Igrave	Ì	U+00CC (204)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter I with grave accent
	convLatin1ToXML[0xCD] = "&Iacute"; // Iacute	Í	U+00CD (205)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter I with acute accent
	convLatin1ToXML[0xCE] = "&Icirc"; // Icirc	Î	U+00CE (206)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter I with circumflex
	convLatin1ToXML[0xCF] = "&Iuml"; // Iuml	Ï	U+00CF (207)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter I with diaeresis
	convLatin1ToXML[0xD0] = "&ETH"; // ETH	Ð	U+00D0 (208)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter Eth
	convLatin1ToXML[0xD1] = "&Ntilde"; // Ntilde	Ñ	U+00D1 (209)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter N with tilde
	convLatin1ToXML[0xD2] = "&Ograve"; // Ograve	Ò	U+00D2 (210)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter O with grave accent
	convLatin1ToXML[0xD3] = "&Oacute"; // Oacute	Ó	U+00D3 (211)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter O with acute accent
	convLatin1ToXML[0xD4] = "&Ocirc"; // Ocirc	Ô	U+00D4 (212)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter O with circumflex
	convLatin1ToXML[0xD5] = "&Otilde"; // Otilde	Õ	U+00D5 (213)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter O with tilde
	convLatin1ToXML[0xD6] = "&Ouml"; // Ouml	Ö	U+00D6 (214)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter O with diaeresis
	convLatin1ToXML[0xD7] = "&times"; // times	×	U+00D7 (215)	HTML 3.2	HTMLlat1	ISOnum	multiplication sign
	convLatin1ToXML[0xD8] = "&Oslash"; // Oslash	Ø	U+00D8 (216)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter O with stroke (= Latin capital letter O slash)
	convLatin1ToXML[0xD9] = "&Ugrave"; // Ugrave	Ù	U+00D9 (217)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter U with grave accent
	convLatin1ToXML[0xDA] = "&Uacute"; // Uacute	Ú	U+00DA (218)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter U with acute accent
	convLatin1ToXML[0xDB] = "&Ucirc"; // Ucirc	Û	U+00DB (219)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter U with circumflex
	convLatin1ToXML[0xDC] = "&Uuml"; // Uuml	Ü	U+00DC (220)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter U with diaeresis
	convLatin1ToXML[0xDD] = "&Yacute"; // Yacute	Ý	U+00DD (221)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter Y with acute accent
	convLatin1ToXML[0xDE] = "&THORN"; // THORN	Þ	U+00DE (222)	HTML 2.0	HTMLlat1	ISOlat1	Latin capital letter THORN
	convLatin1ToXML[0xDF] = "&szlig"; // szlig	ß	U+00DF (223)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter sharp s (= ess-zed); see German Eszett
	convLatin1ToXML[0xE0] = "&agrave"; // agrave	à	U+00E0 (224)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter a with grave accent
	convLatin1ToXML[0xE1] = "&aacute"; // aacute	á	U+00E1 (225)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter a with acute accent
	convLatin1ToXML[0xE2] = "&acirc"; // acirc	â	U+00E2 (226)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter a with circumflex
	convLatin1ToXML[0xE3] = "&atilde"; // atilde	ã	U+00E3 (227)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter a with tilde
	convLatin1ToXML[0xE4] = "&auml"; // auml	ä	U+00E4 (228)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter a with diaeresis
	convLatin1ToXML[0xE5] = "&aring"; // aring	å	U+00E5 (229)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter a with ring above
	convLatin1ToXML[0xE6] = "&aelig"; // aelig	æ	U+00E6 (230)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter ae (= Latin small ligature ae)
	convLatin1ToXML[0xE7] = "&ccedil"; // ccedil	ç	U+00E7 (231)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter c with cedilla
	convLatin1ToXML[0xE8] = "&egrave"; // egrave	è	U+00E8 (232)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter e with grave accent
	convLatin1ToXML[0xE9] = "&eacute"; // eacute	é	U+00E9 (233)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter e with acute accent
	convLatin1ToXML[0xEA] = "&ecirc"; // ecirc	ê	U+00EA (234)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter e with circumflex
	convLatin1ToXML[0xEB] = "&euml"; // euml	ë	U+00EB (235)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter e with diaeresis
	convLatin1ToXML[0xEC] = "&igrave"; // igrave	ì	U+00EC (236)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter i with grave accent
	convLatin1ToXML[0xED] = "&iacute"; // iacute	í	U+00ED (237)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter i with acute accent
	convLatin1ToXML[0xEE] = "&icirc"; // icirc	î	U+00EE (238)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter i with circumflex
	convLatin1ToXML[0xEF] = "&iuml"; // iuml	ï	U+00EF (239)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter i with diaeresis
	convLatin1ToXML[0xF0] = "&eth"; // eth	ð	U+00F0 (240)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter eth
	convLatin1ToXML[0xF1] = "&ntilde"; // ntilde	ñ	U+00F1 (241)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter n with tilde
	convLatin1ToXML[0xF2] = "&ograve"; // ograve	ò	U+00F2 (242)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter o with grave accent
	convLatin1ToXML[0xF3] = "&oacute"; // oacute	ó	U+00F3 (243)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter o with acute accent
	convLatin1ToXML[0xF4] = "&ocirc"; // ocirc	ô	U+00F4 (244)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter o with circumflex
	convLatin1ToXML[0xF5] = "&otilde"; // otilde	õ	U+00F5 (245)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter o with tilde
	convLatin1ToXML[0xF6] = "&ouml"; // ouml	ö	U+00F6 (246)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter o with diaeresis
	convLatin1ToXML[0xF7] = "&divide"; // divide	÷	U+00F7 (247)	HTML 3.2	HTMLlat1	ISOnum	division sign (= obelus)
	convLatin1ToXML[0xF8] = "&oslash"; // oslash	ø	U+00F8 (248)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter o with stroke (= Latin small letter o slash)
	convLatin1ToXML[0xF9] = "&ugrave"; // ugrave	ù	U+00F9 (249)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter u with grave accent
	convLatin1ToXML[0xFA] = "&uacute"; // uacute	ú	U+00FA (250)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter u with acute accent
	convLatin1ToXML[0xFB] = "&ucirc"; // ucirc	û	U+00FB (251)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter u with circumflex
	convLatin1ToXML[0xFC] = "&uuml"; // uuml	ü	U+00FC (252)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter u with diaeresis
	convLatin1ToXML[0xFD] = "&yacute"; // yacute	ý	U+00FD (253)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter y with acute accent
	convLatin1ToXML[0xFE] = "&thorn"; // thorn	þ	U+00FE (254)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter thorn
	convLatin1ToXML[0xFF] = "&yuml"; // yuml	ÿ	U+00FF (255)	HTML 2.0	HTMLlat1	ISOlat1	Latin small letter y with diaeresis
}
177 
178 version (unittest)
179 {
180 	import std.algorithm : equal;
181 }