/** W3C (XML/HTML) Formatting.

    Copyright: Per Nordlöw 2022-.
    License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
    Authors: $(WEB Per Nordlöw)
*/

module nxt.w3c;

import std.traits: isSomeChar, isSomeString;

/** Convert character $(D c) to its HTML representation.

    Params:
        c = character to encode.
        nbsp = when `true`, an ASCII space is emitted as `&nbsp;`; when `false`
               it is passed through unchanged.

    Returns: a named reference for space (optional), `&`, `<`, `>` and `"`;
    the character itself for other code units in the range 1 .. 127; and a
    decimal numeric character reference (`&#N;`) for everything else,
    including NUL.
*/
string toHTML(C)(C c, bool nbsp = true) @safe pure
if (isSomeChar!C)
{
    import std.conv : to;
    if (nbsp && c == ' ') return "&nbsp;"; // non breaking space
    else if (c == '&') return "&amp;"; // ampersand
    else if (c == '<') return "&lt;"; // less than
    else if (c == '>') return "&gt;"; // greater than
    else if (c == '\"') return "&quot;"; // double quote
    // else if (c == '\'')
    //     return ("&#39;"); // if you are in an attribute, it might be important to encode for the same reason as double quotes
    // FIXME: should I encode apostrophes too? as &#39;... I could also do space but if your html is so bad that it doesn't
    // quote attributes at all, maybe you deserve the xss. Encoding spaces will make everything really ugly so meh
    // idk about apostrophes though. Might be worth it, might not.
    else if (0 < c && c < 128)
        return to!string(cast(char)c); // plain ASCII passes through unchanged
    else
        return "&#" ~ to!string(cast(int)c) ~ ";"; // numeric character reference
}

/** Convert string $(D s) to HTML representation.

    Copied from arsd.dom.

    Params:
        s = range of characters to encode.
        nbsp = forwarded to $(D toHTML) for every element; when `true` spaces
               become `&nbsp;`.

    Returns: a lazily evaluated range over the concatenated per-character
    encodings of $(D s).
*/
auto encodeHTML(S)(scope S s, bool nbsp = true) @safe pure
{
    import std.algorithm : joiner, map;
    // Forward `nbsp` explicitly; mapping `toHTML` directly would always use
    // its default and silently ignore the caller's choice.
    return s.map!(c => toHTML(c, nbsp)).joiner(``);
}

pure @safe unittest {
    assert(`<!-- --><script>/* */</script>`
           .encodeHTML
           .equal(`&lt;!--&nbsp;--&gt;&lt;script&gt;/*&nbsp;*/&lt;/script&gt;`));
}

pure @safe unittest {
    // `nbsp = false` must leave spaces untouched.
    assert(`a <b>`.encodeHTML(false).equal(`a &lt;b&gt;`));
}

version (none) /+ TODO: enable +/
pure @safe unittest {
    import std.utf : byDchar;
    assert(`<!-- --><script>/* */</script>`
           .byDchar
           .encodeHTML
           .equal(`&lt;!--&nbsp;--&gt;&lt;script&gt;/*&nbsp;*/&lt;/script&gt;`));
}

/** Lookup table from a Latin-1 code point to its named character reference.

    Entries that have no named reference are left as `null`. Each stored value
    is a complete reference, including the terminating `;`.

    NOTE(review): only `quot`, `amp`, `apos`, `lt` and `gt` are predefined in
    XML; the U+00A0 .. U+00FF names are HTML character entity references.

    See_Also: https://en.wikipedia.org/wiki/Character_entity_reference#Predefined_entities_in_XML
*/
__gshared string[256] convLatin1ToXML;
// See_Also: https://en.wikipedia.org/wiki/Character_entity_reference#Character_entity_references_in_HTML
// string[256] convLatin1ToHTML;

shared static this()
{
    initTables();
}

/** Populate $(D convLatin1ToXML).

    Called once from the module's shared static constructor. Only assigns
    string literals to fixed slots, so calling it again rewrites the same
    values.
*/
void initTables() nothrow @nogc
{
    // Predefined entities. The keys are written as character literals so the
    // slot cannot drift from the character it encodes (decimal code points
    // 60 and 62 are 0x3C and 0x3E, not 0x60 and 0x62).
    convLatin1ToXML['"'] = "&quot;";  // U+0022 (34) HTML 2.0 HTMLspecial ISOnum quotation mark (= APL quote)
    convLatin1ToXML['&'] = "&amp;";   // U+0026 (38) HTML 2.0 HTMLspecial ISOnum ampersand
    convLatin1ToXML['\''] = "&apos;"; // U+0027 (39) XHTML 1.0 HTMLspecial ISOnum apostrophe (= apostrophe-quote); see below
    convLatin1ToXML['<'] = "&lt;";    // U+003C (60) HTML 2.0 HTMLspecial ISOnum less-than sign
    convLatin1ToXML['>'] = "&gt;";    // U+003E (62) HTML 2.0 HTMLspecial ISOnum greater-than sign

    convLatin1ToXML[0xA0] = "&nbsp;"; // nbsp U+00A0 (160) HTML 3.2 HTMLlat1 ISOnum no-break space (= non-breaking space)[d]
    convLatin1ToXML[0xA1] = "&iexcl;"; // iexcl ¡ U+00A1 (161) HTML 3.2 HTMLlat1 ISOnum inverted exclamation mark
    convLatin1ToXML[0xA2] = "&cent;"; // cent ¢ U+00A2 (162) HTML 3.2 HTMLlat1 ISOnum cent sign
    convLatin1ToXML[0xA3] = "&pound;"; // pound £ U+00A3 (163) HTML 3.2 HTMLlat1 ISOnum pound sign
    convLatin1ToXML[0xA4] = "&curren;"; // curren ¤ U+00A4 (164) HTML 3.2 HTMLlat1 ISOnum currency sign
    convLatin1ToXML[0xA5] = "&yen;"; // yen ¥ U+00A5 (165) HTML 3.2 HTMLlat1 ISOnum yen sign (= yuan sign)
    convLatin1ToXML[0xA6] = "&brvbar;"; // brvbar ¦ U+00A6 (166) HTML 3.2 HTMLlat1 ISOnum broken bar (= broken vertical bar)
    convLatin1ToXML[0xA7] = "&sect;"; // sect § U+00A7 (167) HTML 3.2 HTMLlat1 ISOnum section sign
    convLatin1ToXML[0xA8] = "&uml;"; // uml ¨ U+00A8 (168) HTML 3.2 HTMLlat1 ISOdia diaeresis (= spacing diaeresis); see Germanic umlaut
    convLatin1ToXML[0xA9] = "&copy;"; // copy © U+00A9 (169) HTML 3.2 HTMLlat1 ISOnum copyright symbol
    convLatin1ToXML[0xAA] = "&ordf;"; // ordf ª U+00AA (170) HTML 3.2 HTMLlat1 ISOnum feminine ordinal indicator
    convLatin1ToXML[0xAB] = "&laquo;"; // laquo « U+00AB (171) HTML 3.2 HTMLlat1 ISOnum left-pointing double angle quotation mark (= left pointing guillemet)
    convLatin1ToXML[0xAC] = "&not;"; // not ¬ U+00AC (172) HTML 3.2 HTMLlat1 ISOnum not sign
    convLatin1ToXML[0xAD] = "&shy;"; // shy U+00AD (173) HTML 3.2 HTMLlat1 ISOnum soft hyphen (= discretionary hyphen)
    convLatin1ToXML[0xAE] = "&reg;"; // reg ® U+00AE (174) HTML 3.2 HTMLlat1 ISOnum registered sign ( = registered trademark symbol)
    convLatin1ToXML[0xAF] = "&macr;"; // macr ¯ U+00AF (175) HTML 3.2 HTMLlat1 ISOdia macron (= spacing macron = overline = APL overbar)
    convLatin1ToXML[0xB0] = "&deg;"; // deg ° U+00B0 (176) HTML 3.2 HTMLlat1 ISOnum degree symbol
    convLatin1ToXML[0xB1] = "&plusmn;"; // plusmn ± U+00B1 (177) HTML 3.2 HTMLlat1 ISOnum plus-minus sign (= plus-or-minus sign)
    convLatin1ToXML[0xB2] = "&sup2;"; // sup2 ² U+00B2 (178) HTML 3.2 HTMLlat1 ISOnum superscript two (= superscript digit two = squared)
    convLatin1ToXML[0xB3] = "&sup3;"; // sup3 ³ U+00B3 (179) HTML 3.2 HTMLlat1 ISOnum superscript three (= superscript digit three = cubed)
    convLatin1ToXML[0xB4] = "&acute;"; // acute ´ U+00B4 (180) HTML 3.2 HTMLlat1 ISOdia acute accent (= spacing acute)
    convLatin1ToXML[0xB5] = "&micro;"; // micro µ U+00B5 (181) HTML 3.2 HTMLlat1 ISOnum micro sign
    convLatin1ToXML[0xB6] = "&para;"; // para ¶ U+00B6 (182) HTML 3.2 HTMLlat1 ISOnum pilcrow sign ( = paragraph sign)
    convLatin1ToXML[0xB7] = "&middot;"; // middot · U+00B7 (183) HTML 3.2 HTMLlat1 ISOnum middle dot (= Georgian comma = Greek middle dot)
    convLatin1ToXML[0xB8] = "&cedil;"; // cedil ¸ U+00B8 (184) HTML 3.2 HTMLlat1 ISOdia cedilla (= spacing cedilla)
    convLatin1ToXML[0xB9] = "&sup1;"; // sup1 ¹ U+00B9 (185) HTML 3.2 HTMLlat1 ISOnum superscript one (= superscript digit one)
    convLatin1ToXML[0xBA] = "&ordm;"; // ordm º U+00BA (186) HTML 3.2 HTMLlat1 ISOnum masculine ordinal indicator
    convLatin1ToXML[0xBB] = "&raquo;"; // raquo » U+00BB (187) HTML 3.2 HTMLlat1 ISOnum right-pointing double angle quotation mark (= right pointing guillemet)
    convLatin1ToXML[0xBC] = "&frac14;"; // frac14 ¼ U+00BC (188) HTML 3.2 HTMLlat1 ISOnum vulgar fraction one quarter (= fraction one quarter)
    convLatin1ToXML[0xBD] = "&frac12;"; // frac12 ½ U+00BD (189) HTML 3.2 HTMLlat1 ISOnum vulgar fraction one half (= fraction one half)
    convLatin1ToXML[0xBE] = "&frac34;"; // frac34 ¾ U+00BE (190) HTML 3.2 HTMLlat1 ISOnum vulgar fraction three quarters (= fraction three quarters)
    convLatin1ToXML[0xBF] = "&iquest;"; // iquest ¿ U+00BF (191) HTML 3.2 HTMLlat1 ISOnum inverted question mark (= turned question mark)
    convLatin1ToXML[0xC0] = "&Agrave;"; // Agrave À U+00C0 (192) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter A with grave accent (= Latin capital letter A grave)
    convLatin1ToXML[0xC1] = "&Aacute;"; // Aacute Á U+00C1 (193) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter A with acute accent
    convLatin1ToXML[0xC2] = "&Acirc;"; // Acirc Â U+00C2 (194) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter A with circumflex
    convLatin1ToXML[0xC3] = "&Atilde;"; // Atilde Ã U+00C3 (195) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter A with tilde
    convLatin1ToXML[0xC4] = "&Auml;"; // Auml Ä U+00C4 (196) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter A with diaeresis
    convLatin1ToXML[0xC5] = "&Aring;"; // Aring Å U+00C5 (197) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter A with ring above (= Latin capital letter A ring)
    convLatin1ToXML[0xC6] = "&AElig;"; // AElig Æ U+00C6 (198) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter AE (= Latin capital ligature AE)
    convLatin1ToXML[0xC7] = "&Ccedil;"; // Ccedil Ç U+00C7 (199) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter C with cedilla
    convLatin1ToXML[0xC8] = "&Egrave;"; // Egrave È U+00C8 (200) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter E with grave accent
    convLatin1ToXML[0xC9] = "&Eacute;"; // Eacute É U+00C9 (201) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter E with acute accent
    convLatin1ToXML[0xCA] = "&Ecirc;"; // Ecirc Ê U+00CA (202) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter E with circumflex
    convLatin1ToXML[0xCB] = "&Euml;"; // Euml Ë U+00CB (203) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter E with diaeresis
    convLatin1ToXML[0xCC] = "&Igrave;"; // Igrave Ì U+00CC (204) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter I with grave accent
    convLatin1ToXML[0xCD] = "&Iacute;"; // Iacute Í U+00CD (205) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter I with acute accent
    convLatin1ToXML[0xCE] = "&Icirc;"; // Icirc Î U+00CE (206) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter I with circumflex
    convLatin1ToXML[0xCF] = "&Iuml;"; // Iuml Ï U+00CF (207) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter I with diaeresis
    convLatin1ToXML[0xD0] = "&ETH;"; // ETH Ð U+00D0 (208) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter Eth
    convLatin1ToXML[0xD1] = "&Ntilde;"; // Ntilde Ñ U+00D1 (209) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter N with tilde
    convLatin1ToXML[0xD2] = "&Ograve;"; // Ograve Ò U+00D2 (210) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter O with grave accent
    convLatin1ToXML[0xD3] = "&Oacute;"; // Oacute Ó U+00D3 (211) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter O with acute accent
    convLatin1ToXML[0xD4] = "&Ocirc;"; // Ocirc Ô U+00D4 (212) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter O with circumflex
    convLatin1ToXML[0xD5] = "&Otilde;"; // Otilde Õ U+00D5 (213) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter O with tilde
    convLatin1ToXML[0xD6] = "&Ouml;"; // Ouml Ö U+00D6 (214) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter O with diaeresis
    convLatin1ToXML[0xD7] = "&times;"; // times × U+00D7 (215) HTML 3.2 HTMLlat1 ISOnum multiplication sign
    convLatin1ToXML[0xD8] = "&Oslash;"; // Oslash Ø U+00D8 (216) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter O with stroke (= Latin capital letter O slash)
    convLatin1ToXML[0xD9] = "&Ugrave;"; // Ugrave Ù U+00D9 (217) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter U with grave accent
    convLatin1ToXML[0xDA] = "&Uacute;"; // Uacute Ú U+00DA (218) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter U with acute accent
    convLatin1ToXML[0xDB] = "&Ucirc;"; // Ucirc Û U+00DB (219) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter U with circumflex
    convLatin1ToXML[0xDC] = "&Uuml;"; // Uuml Ü U+00DC (220) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter U with diaeresis
    convLatin1ToXML[0xDD] = "&Yacute;"; // Yacute Ý U+00DD (221) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter Y with acute accent
    convLatin1ToXML[0xDE] = "&THORN;"; // THORN Þ U+00DE (222) HTML 2.0 HTMLlat1 ISOlat1 Latin capital letter THORN
    convLatin1ToXML[0xDF] = "&szlig;"; // szlig ß U+00DF (223) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter sharp s (= ess-zed); see German Eszett
    convLatin1ToXML[0xE0] = "&agrave;"; // agrave à U+00E0 (224) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter a with grave accent
    convLatin1ToXML[0xE1] = "&aacute;"; // aacute á U+00E1 (225) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter a with acute accent
    convLatin1ToXML[0xE2] = "&acirc;"; // acirc â U+00E2 (226) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter a with circumflex
    convLatin1ToXML[0xE3] = "&atilde;"; // atilde ã U+00E3 (227) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter a with tilde
    convLatin1ToXML[0xE4] = "&auml;"; // auml ä U+00E4 (228) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter a with diaeresis
    convLatin1ToXML[0xE5] = "&aring;"; // aring å U+00E5 (229) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter a with ring above
    convLatin1ToXML[0xE6] = "&aelig;"; // aelig æ U+00E6 (230) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter ae (= Latin small ligature ae)
    convLatin1ToXML[0xE7] = "&ccedil;"; // ccedil ç U+00E7 (231) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter c with cedilla
    convLatin1ToXML[0xE8] = "&egrave;"; // egrave è U+00E8 (232) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter e with grave accent
    convLatin1ToXML[0xE9] = "&eacute;"; // eacute é U+00E9 (233) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter e with acute accent
    convLatin1ToXML[0xEA] = "&ecirc;"; // ecirc ê U+00EA (234) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter e with circumflex
    convLatin1ToXML[0xEB] = "&euml;"; // euml ë U+00EB (235) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter e with diaeresis
    convLatin1ToXML[0xEC] = "&igrave;"; // igrave ì U+00EC (236) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter i with grave accent
    convLatin1ToXML[0xED] = "&iacute;"; // iacute í U+00ED (237) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter i with acute accent
    convLatin1ToXML[0xEE] = "&icirc;"; // icirc î U+00EE (238) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter i with circumflex
    convLatin1ToXML[0xEF] = "&iuml;"; // iuml ï U+00EF (239) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter i with diaeresis
    convLatin1ToXML[0xF0] = "&eth;"; // eth ð U+00F0 (240) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter eth
    convLatin1ToXML[0xF1] = "&ntilde;"; // ntilde ñ U+00F1 (241) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter n with tilde
    convLatin1ToXML[0xF2] = "&ograve;"; // ograve ò U+00F2 (242) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter o with grave accent
    convLatin1ToXML[0xF3] = "&oacute;"; // oacute ó U+00F3 (243) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter o with acute accent
    convLatin1ToXML[0xF4] = "&ocirc;"; // ocirc ô U+00F4 (244) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter o with circumflex
    convLatin1ToXML[0xF5] = "&otilde;"; // otilde õ U+00F5 (245) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter o with tilde
    convLatin1ToXML[0xF6] = "&ouml;"; // ouml ö U+00F6 (246) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter o with diaeresis
    convLatin1ToXML[0xF7] = "&divide;"; // divide ÷ U+00F7 (247) HTML 3.2 HTMLlat1 ISOnum division sign (= obelus)
    convLatin1ToXML[0xF8] = "&oslash;"; // oslash ø U+00F8 (248) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter o with stroke (= Latin small letter o slash)
    convLatin1ToXML[0xF9] = "&ugrave;"; // ugrave ù U+00F9 (249) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter u with grave accent
    convLatin1ToXML[0xFA] = "&uacute;"; // uacute ú U+00FA (250) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter u with acute accent
    convLatin1ToXML[0xFB] = "&ucirc;"; // ucirc û U+00FB (251) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter u with circumflex
    convLatin1ToXML[0xFC] = "&uuml;"; // uuml ü U+00FC (252) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter u with diaeresis
    convLatin1ToXML[0xFD] = "&yacute;"; // yacute ý U+00FD (253) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter y with acute accent
    convLatin1ToXML[0xFE] = "&thorn;"; // thorn þ U+00FE (254) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter thorn
    convLatin1ToXML[0xFF] = "&yuml;"; // yuml ÿ U+00FF (255) HTML 2.0 HTMLlat1 ISOlat1 Latin small letter y with diaeresis
}

version (unittest)
{
    import std.algorithm : equal;
}