diff --git a/src/main/java/mpi/tools/javatools/parsers/Char17.java b/src/main/java/mpi/tools/javatools/parsers/Char17.java
new file mode 100644
index 0000000..34f9dd2
--- /dev/null
+++ b/src/main/java/mpi/tools/javatools/parsers/Char17.java
@@ -0,0 +1,1063 @@
+package mpi.tools.javatools.parsers;
+
+import java.util.Arrays;
+
+import mpi.tools.javatools.datatypes.IntHashMap;
+import mpi.tools.javatools.datatypes.IntKeyMap;
+
+/**
+This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools).
+It is licensed under the Creative Commons Attribution License
+(see http://creativecommons.org/licenses/by/3.0) by
+the YAGO-NAGA team (see http://mpii.de/yago-naga).
+
+This class is the improved version of the original Char class,
+which takes into account that Java 1.7 performs a copy for substring()
+
+This class provides static methods to decode, encode and normalize Strings.
+Decoding converts the following codes to Java 16-bit characters (char):
+
+ - all HTML ampersand codes (like ) as specified by the W3C
+
- all backslash codes (like \ b) as specified by the Java language specification
+
- all percentage codes (like %2C) as used in URLs and E-Mails
+
- all UTF-8 codes (like ī) as specified in Wikipedia
+
+
+Encoding is the inverse operation. It takes a Java 16-bit character (char) and
+outputs its encoding in HTML, as a backslash code, as a percentage code or in UTF8.
+
+Normalization converts the following Unicode characters (Java 16-bit chars)
+to ASCII-characters in the range 0x20-0x7F:
+
+ - all ASCII control characters (0x00-0x1F)
+
- all Latin-1 characters (0x80-0xFF) to the closest transliteration
+
- all Latin Extended-A characters (0x100-0x17F) to the closest transliteration
+
- all Greek characters (0x374-0x3D6) to the closest transliteration as specified in Wikipedia
+
- all General-Punctuation characters (0x2016-0x2055) to the closest ASCII punctuation
+
- most mathematical symbols (in the range of 0x2000) to the common program code identifier or text
+
- all ligatures (0xFB00-0xFB06, the nasty things you get when you copy/paste from PDFs) to
+ the separate characters
+
+ */
+public class Char17 {
+
+ /** String returned by the default implementation of defaultNormalizer, "[?]"*/
+ public static String UNKNOWN = "[?]";
+
+ /** Maps a special character to a HTML ampersand sequence */
+ public static IntKeyMap charToAmpersand = new IntKeyMap(
+ '&', "&", '\'', "'", '<', "<", '>', ">", '"',
+ """);
+
+ /** Maps a special character to a backslash sequence */
+ public static IntKeyMap charToBackslash = new IntKeyMap(
+ '\\', "\\\\", '\n', "\\n");
+
+ /** Maps HTML ampersand sequences to strings */
+ public static IntHashMap ampersandMap = new IntHashMap(
+ Arrays.asList((Object) "nbsp", (char) 160, "iexcl", (char) 161,
+ "cent", (char) 162, "pound", (char) 163, "curren",
+ (char) 164, "yen", (char) 165, "brvbar", (char) 166,
+ "sect", (char) 167, "uml", (char) 168, "copy", (char) 169,
+ "ordf", (char) 170, "laquo", (char) 171, "not", (char) 172,
+ "shy", (char) 173, "reg", (char) 174, "macr", (char) 175,
+ "deg", (char) 176, "plusmn", (char) 177, "sup2",
+ (char) 178, "sup3", (char) 179, "acute", (char) 180,
+ "micro", (char) 181, "para", (char) 182, "middot",
+ (char) 183, "cedil", (char) 184, "sup1", (char) 185,
+ "ordm", (char) 186, "raquo", (char) 187, "frac14",
+ (char) 188, "frac12", (char) 189, "frac34", (char) 190,
+ "iquest", (char) 191, "Agrave", (char) 192, "Aacute",
+ (char) 193, "Acirc", (char) 194, "Atilde", (char) 195,
+ "Auml", (char) 196, "Aring", (char) 197, "AElig",
+ (char) 198, "Ccedil", (char) 199, "Egrave", (char) 200,
+ "Eacute", (char) 201, "Ecirc", (char) 202, "Euml",
+ (char) 203, "Igrave", (char) 204, "Iacute", (char) 205,
+ "Icirc", (char) 206, "Iuml", (char) 207, "ETH", (char) 208,
+ "Ntilde", (char) 209, "Ograve", (char) 210, "Oacute",
+ (char) 211, "Ocirc", (char) 212, "Otilde", (char) 213,
+ "Ouml", (char) 214, "times", (char) 215, "Oslash",
+ (char) 216, "Ugrave", (char) 217, "Uacute", (char) 218,
+ "Ucirc", (char) 219, "Uuml", (char) 220, "Yacute",
+ (char) 221, "THORN", (char) 222, "szlig", (char) 223,
+ "agrave", (char) 224, "aacute", (char) 225, "acirc",
+ (char) 226, "atilde", (char) 227, "auml", (char) 228,
+ "aring", (char) 229, "aelig", (char) 230, "ccedil",
+ (char) 231, "egrave", (char) 232, "eacute", (char) 233,
+ "ecirc", (char) 234, "euml", (char) 235, "igrave",
+ (char) 236, "iacute", (char) 237, "icirc", (char) 238,
+ "iuml", (char) 239, "eth", (char) 240, "ntilde",
+ (char) 241, "ograve", (char) 242, "oacute", (char) 243,
+ "ocirc", (char) 244, "otilde", (char) 245, "ouml",
+ (char) 246, "divide", (char) 247, "oslash", (char) 248,
+ "ugrave", (char) 249, "uacute", (char) 250, "ucirc",
+ (char) 251, "uuml", (char) 252, "yacute", (char) 253,
+ "thorn", (char) 254, "yuml", (char) 255, "fnof",
+ (char) 402, "Alpha", (char) 913, "Beta", (char) 914,
+ "Gamma", (char) 915, "Delta", (char) 916, "Epsilon",
+ (char) 917, "Zeta", (char) 918, "Eta", (char) 919, "Theta",
+ (char) 920, "Iota", (char) 921, "Kappa", (char) 922,
+ "Lambda", (char) 923, "Mu", (char) 924, "Nu", (char) 925,
+ "Xi", (char) 926, "Omicron", (char) 927, "Pi", (char) 928,
+ "Rho", (char) 929, "Sigma", (char) 931, "Tau", (char) 932,
+ "Upsilon", (char) 933, "Phi", (char) 934, "Chi",
+ (char) 935, "Psi", (char) 936, "Omega", (char) 937,
+ "alpha", (char) 945, "beta", (char) 946, "gamma",
+ (char) 947, "delta", (char) 948, "epsilon", (char) 949,
+ "zeta", (char) 950, "eta", (char) 951, "theta", (char) 952,
+ "iota", (char) 953, "kappa", (char) 954, "lambda",
+ (char) 955, "mu", (char) 956, "nu", (char) 957, "xi",
+ (char) 958, "omicron", (char) 959, "pi", (char) 960, "rho",
+ (char) 961, "sigmaf", (char) 962, "sigma", (char) 963,
+ "tau", (char) 964, "upsilon", (char) 965, "phi",
+ (char) 966, "chi", (char) 967, "psi", (char) 968, "omega",
+ (char) 969, "thetasym", (char) 977, "upsih", (char) 978,
+ "piv", (char) 982, "bull", (char) 8226, "hellip",
+ (char) 8230, "prime", (char) 8242, "Prime", (char) 8243,
+ "oline", (char) 8254, "frasl", (char) 8260, "weierp",
+ (char) 8472, "image", (char) 8465, "real", (char) 8476,
+ "trade", (char) 8482, "alefsym", (char) 8501, "larr",
+ (char) 8592, "uarr", (char) 8593, "rarr", (char) 8594,
+ "darr", (char) 8595, "harr", (char) 8596, "crarr",
+ (char) 8629, "lArr", (char) 8656, "uArr", (char) 8657,
+ "rArr", (char) 8658, "dArr", (char) 8659, "hArr",
+ (char) 8660, "forall", (char) 8704, "part", (char) 8706,
+ "exist", (char) 8707, "empty", (char) 8709, "nabla",
+ (char) 8711, "isin", (char) 8712, "notin", (char) 8713,
+ "ni", (char) 8715, "prod", (char) 8719, "sum", (char) 8721,
+ "minus", (char) 8722, "lowast", (char) 8727, "radic",
+ (char) 8730, "prop", (char) 8733, "infin", (char) 8734,
+ "ang", (char) 8736, "and", (char) 8743, "or", (char) 8744,
+ "cap", (char) 8745, "cup", (char) 8746, "int", (char) 8747,
+ "there4", (char) 8756, "sim", (char) 8764, "cong",
+ (char) 8773, "asymp", (char) 8776, "ne", (char) 8800,
+ "equiv", (char) 8801, "le", (char) 8804, "ge", (char) 8805,
+ "sub", (char) 8834, "sup", (char) 8835, "nsub",
+ (char) 8836, "sube", (char) 8838, "supe", (char) 8839,
+ "oplus", (char) 8853, "otimes", (char) 8855, "perp",
+ (char) 8869, "sdot", (char) 8901, "lceil", (char) 8968,
+ "rceil", (char) 8969, "lfloor", (char) 8970, "rfloor",
+ (char) 8971, "lang", (char) 9001, "rang", (char) 9002,
+ "loz", (char) 9674, "spades", (char) 9824, "clubs",
+ (char) 9827, "hearts", (char) 9829, "diams", (char) 9830,
+ "quot", (char) 34, "amp", (char) 38, "lt", (char) 60, "gt",
+ (char) 62, "OElig", (char) 338, "oelig", (char) 339,
+ "Scaron", (char) 352, "scaron", (char) 353, "Yuml",
+ (char) 376, "circ", (char) 710, "tilde", (char) 732,
+ "ensp", (char) 8194, "emsp", (char) 8195, "thinsp",
+ (char) 8201, "zwnj", (char) 8204, "zwj",
+ (char) 8205,
+ "lrm",
+ (char) 8206,
+ "rlm",
+ (char) 8207,
+ "ndash",
+ (char) 8211, // 0x2013
+ "mdash", (char) 8212, "lsquo", (char) 8216, "rsquo",
+ (char) 8217, "sbquo", (char) 8218, "ldquo", (char) 8220,
+ "rdquo", (char) 8221, "bdquo", (char) 8222, "dagger",
+ (char) 8224, "Dagger", (char) 8225, "permil", (char) 8240,
+ "lsaquo", (char) 8249, "rsaquo", (char) 8250, "euro",
+ (char) 8364, "apos", '\''));
+
+ /** Maps characters to normalizations */
+ public static IntKeyMap normalizeMap = new IntKeyMap(
+ // ASCII
+ (char) 7,
+ "BEEP",
+ (char) 9,
+ " ",
+ (char) 10,
+ "\n",
+
+ // Latin-1
+ (char) 160,
+ " ",
+ (char) 161,
+ "!",
+ (char) 162,
+ "cent",
+ (char) 163,
+ "pound",
+ (char) 164,
+ "currency",
+ (char) 165,
+ "yen",
+ (char) 166,
+ "|",
+ (char) 167,
+ "/",
+ (char) 169,
+ "(c)",
+ (char) 170,
+ "^a",
+ (char) 171,
+ "\"",
+ (char) 172,
+ "~",
+ (char) 173,
+ "",
+ (char) 174,
+ "(R)",
+ (char) 176,
+ "degree",
+ (char) 177,
+ "+/-",
+ (char) 178,
+ "^2",
+ (char) 179,
+ "^3",
+ (char) 180,
+ "'",
+ (char) 181,
+ "mu",
+ (char) 182,
+ "P",
+ (char) 183,
+ ".",
+ (char) 184,
+ ",",
+ (char) 185,
+ "^1",
+ (char) 186,
+ "^o",
+ (char) 187,
+ "\"",
+ (char) 188,
+ "1/4",
+ (char) 189,
+ "1/2",
+ (char) 190,
+ "3/4",
+ (char) 191,
+ "?",
+ (char) 0xC4,
+ "Ae",
+ (char) 0xD6,
+ "Oe",
+ (char) 0xDC,
+ "Ue",
+ (char) 0xDF,
+ "ss",
+ (char) 0xC6,
+ "Ae",
+ (char) 0xC7,
+ "C",
+ (char) 0xD0,
+ "D",
+ (char) 0xD1,
+ "N",
+ (char) 0xD7,
+ "x",
+ (char) 0xDD,
+ "Y",
+ (char) 0xDE,
+ "b",
+ (char) 0xF7,
+ "/",
+ (char) 0xFF,
+ "y",
+
+ // Latin Extended-A
+ (char) 0x132,
+ "IJ",
+ (char) 0x134,
+ "J",
+ (char) 0x170,
+ "Ue",
+ (char) 0x174,
+ "W",
+ (char) 0x17F,
+ "f",
+
+ // Greek
+ (char) 0x374, "'", (char) 0x375, ",", (char) 0x37A, ",",
+ (char) 0x37E, ";", (char) 0x384, "'", (char) 0x385, "'",
+ (char) 0x386, "A", (char) 0x387, ".", (char) 0x388, "E",
+ (char) 0x380, "I", (char) 0x38C, "O", (char) 0x38E, "Y",
+ (char) 0x38F, "O", (char) 0x390, "i", (char) 215, "*", (char) 913,
+ "A", (char) 914, "B", (char) 915, "G", (char) 916, "D", (char) 917,
+ "E", (char) 918, "Z", (char) 919, "E", (char) 920, "Th",
+ (char) 921,
+ "I",
+ (char) 922,
+ "K",
+ (char) 923,
+ "L",
+ (char) 924,
+ "M",
+ (char) 925,
+ "N",
+ (char) 926,
+ "X",
+ (char) 927,
+ "O",
+ (char) 928,
+ "P",
+ (char) 929,
+ "R",
+ (char) 931,
+ "S",
+ (char) 932,
+ "T",
+ (char) 933,
+ "Y",
+ (char) 934,
+ "Ph",
+ (char) 935,
+ "Ch",
+ (char) 936,
+ "Ps",
+ (char) 937,
+ "O",
+ (char) 977,
+ "th",
+ (char) 978,
+ "y",
+ (char) 982,
+ "pi",
+
+ // General Punctuation
+ (char) 0x2013, "-", (char) 0x2016, "||", (char) 0x2017, "_",
+ (char) 0x2020, "+", (char) 0x2021, "++", (char) 0x2022, "*",
+ (char) 0x2023, "*", (char) 0x2024, ".", (char) 0x2025, "..",
+ (char) 0x2026, "...", (char) 0x2027, ".", (char) 0x2028, "\n",
+ (char) 0x2030, "/1000", (char) 0x2031, "/10000", (char) 0x2032,
+ "'", (char) 0x2033, "''", (char) 0x2034, "'''", (char) 0x2035, "'",
+ (char) 0x2036, "''", (char) 0x2037, "'''", (char) 0x2038, "^",
+ (char) 0x2039, "\"", (char) 0x203A, "\"", (char) 0x203B, "*",
+ (char) 0x203C, "!!", (char) 0x203D, "?!", (char) 0x2041, ",",
+ (char) 0x2042, "***", (char) 0x2043, "-", (char) 0x2044, "/",
+ (char) 0x2045, "[", (char) 0x2046, "]", (char) 0x2047, "??",
+ (char) 0x2048, "?!", (char) 0x2049, "!?",
+ (char) 0x204A,
+ "-",
+ (char) 0x204B,
+ "P",
+ (char) 0x204C,
+ "<",
+ (char) 0x204D,
+ ">",
+ (char) 0x204F,
+ ";",
+ (char) 0x2050,
+ "-",
+ (char) 0x2051,
+ "**",
+ (char) 0x2052,
+ "./.",
+ (char) 0x2053,
+ "~",
+ (char) 0x2054,
+ "_",
+ (char) 0x2055,
+ "_",
+
+ // Mathematical symbols
+ (char) 8465, "I", (char) 8476, "R", (char) 8482, "(TM)",
+ (char) 8501, "a", (char) 8592, "<-", (char) 8593, "^", (char) 8594,
+ "->", (char) 8595, "v", (char) 8596, "<->", (char) 8629, "<-'",
+ (char) 8656, "<=", (char) 8657, "^", (char) 8658, "=>",
+ (char) 8659, "v", (char) 8660, "<=>", (char) 8704, "FOR ALL",
+ (char) 8706, "d", (char) 8707, "EXIST", (char) 8709, "{}",
+ (char) 8712, "IN", (char) 8713, "NOT IN", (char) 8715, "CONTAINS",
+ (char) 8719, "PRODUCT", (char) 8721, "SUM", (char) 8722, "-",
+ (char) 8727, "*", (char) 8730, "SQRT", (char) 8733, "~",
+ (char) 8734, "INF", (char) 8736, "angle", (char) 8743, "&",
+ (char) 8744, "|", (char) 8745, "INTERSECTION", (char) 8746,
+ "UNION", (char) 8747, "INTEGRAL", (char) 8756, "=>", (char) 8764,
+ "~", (char) 8773, "~=", (char) 8776, "~=", (char) 8800, "!=",
+ (char) 8801, "==", (char) 8804, "=<", (char) 8805, ">=",
+ (char) 8834, "SUBSET OF", (char) 8835, "SUPERSET OF", (char) 8836,
+ "NOT SUBSET OF", (char) 8838, "SUBSET OR EQUAL", (char) 8839,
+ "SUPERSET OR EQUAL", (char) 8853, "(+)", (char) 8855, "(*)",
+ (char) 8869, "_|_", (char) 8901, "*", (char) 8364,
+ "EUR",
+
+ // Ligatures
+ (char) 0xFB00, "ff", (char) 0xFB01, "fi", (char) 0xFB02, "fl",
+ (char) 0xFB03, "ffi", (char) 0xFB04, "ffl", (char) 0xFB05, "ft",
+ (char) 0xFB06, "st");
+
+ /** Normalizes a character to a String of characters in the range 0x20-0x7F.
+ * Returns a String, because some characters are
+ * normalized to multiple characters (e.g. umlauts) and
+ * some characters are normalized to zero characters (e.g. special Unicode space chars).
+ * Returns null for the EndOfFile character -1 */
+ public static String normalize(int c) {
+ // EOF
+ if (c == -1)
+ return (null);
+
+ // ASCII chars
+ if (c >= ' ' && c <= 128)
+ return ("" + (char) c);
+
+ // Upper case
+ boolean u = Character.isUpperCase(c);
+ char cu = (char) Character.toUpperCase(c);
+
+ // Check map
+ if (normalizeMap.get(cu) != null)
+ return (u ? normalizeMap.get(cu) : normalizeMap.get(cu)
+ .toLowerCase());
+
+ // ASCII
+ if (c < ' ')
+ return ("");
+
+ // Latin-1
+ if (cu >= 0xC0 && cu <= 0xC5)
+ return (u ? "A" : "a");
+ if (cu >= 0xC8 && cu <= 0xCB)
+ return (u ? "E" : "e");
+ if (cu >= 0xCC && cu <= 0xCF)
+ return (u ? "I" : "i");
+ if (cu >= 0xD2 && cu <= 0xD8)
+ return (u ? "O" : "o");
+ if (cu >= 0x80 && cu <= 0xA0)
+ return (" ");
+
+ // Latin Extended-A
+ if (cu >= 0x100 && cu <= 0x105)
+ return (u ? "A" : "a");
+ if (cu >= 0x106 && cu <= 0x10D)
+ return (u ? "C" : "c");
+ if (cu >= 0x10E && cu <= 0x111)
+ return (u ? "D" : "d");
+ if (cu >= 0x112 && cu <= 0x11B)
+ return (u ? "E" : "e");
+ if (cu >= 0x11C && cu <= 0x123)
+ return (u ? "G" : "g");
+ if (cu >= 0x124 && cu <= 0x127)
+ return (u ? "H" : "h");
+ if (cu >= 0x128 && cu <= 0x131)
+ return (u ? "I" : "i");
+ if (cu >= 0x136 && cu <= 0x138)
+ return (u ? "K" : "k");
+ if (cu >= 0x139 && cu <= 0x142)
+ return (u ? "L" : "l");
+ if (cu >= 0x143 && cu <= 0x14B)
+ return (u ? "N" : "n");
+ if (cu >= 0x14C && cu <= 0x14F)
+ return (u ? "O" : "o");
+ if (cu >= 0x150 && cu <= 0x153)
+ return (u ? "Oe" : "oe");
+ if (cu >= 0x156 && cu <= 0x159)
+ return (u ? "R" : "r");
+ if (cu >= 0x15A && cu <= 0x161)
+ return (u ? "S" : "s");
+ if (cu >= 0x161 && cu <= 0x167)
+ return (u ? "T" : "t");
+ if (cu >= 0x176 && cu <= 0x178)
+ return (u ? "Y" : "y");
+ if (cu >= 0x179 && cu <= 0x17E)
+ return (u ? "Z" : "z");
+
+ // General Punctuation
+ if (cu >= 0x2000 && cu <= 0x200A)
+ return (" ");
+ if (cu >= 0x200B && cu <= 0x200F)
+ return ("");
+ if (cu >= 0x2010 && cu <= 0x2015)
+ return ("--");
+ if (cu >= 0x2018 && cu <= 0x201B)
+ return ("'");
+ if (cu >= 0x201C && cu <= 0x201F)
+ return ("\"");
+ if (cu >= 0x2029 && cu <= 0x202F)
+ return (" ");
+ if (cu >= 0x203E && cu <= 0x2040)
+ return ("-");
+ if (cu >= 0x2056 && cu <= 0x205E)
+ return (".");
+
+ return (UNKNOWN);
+ }
+
+ /** Decodes percentage characters of the form "%xx" in a string. Also does UTF8 decoding.*/
+ public static String decodePercentage(String string) {
+ int pos = string.indexOf('%');
+ if (pos == -1)
+ return (string);
+ StringBuilder result = new StringBuilder(string.length());
+ for (int i = 0; i < string.length(); i++) {
+ if (string.charAt(i) != '%' || i > string.length() - 3) {
+ result.append(string.charAt(i));
+ continue;
+ }
+ try {
+ result.append((char) Integer.parseInt(
+ string.substring(i + 1, i + 3), 16));
+ i += 2;
+ } catch (Exception e) {
+ // Illegal percentage code, just keep it literally
+ result.append('%');
+ }
+ }
+ return (decodeUtf8(result.toString()));
+ }
+
+ /** Decodes an HTML ampersand code such as "& amp", returns -1 in case of failure*/
+ public static int decodeAmpersandChar(String b) {
+ // Get just the code portion
+ if (b.startsWith("&"))
+ b = b.substring(1);
+ if (b.endsWith(";"))
+ b = cutLast(b);
+ // Hexadecimal characters
+ if (b.startsWith("#x")) {
+ try {
+ return (((char) Integer.parseInt(b.substring(2), 16)));
+ } catch (Exception e) {
+ // Invalid char
+ return (-1);
+ }
+ }
+ // Decimal characters
+ if (b.startsWith("#")) {
+ try {
+ return (((char) Integer.parseInt(b.substring(1))));
+ } catch (Exception e) {
+ // Invalid char
+ return (-1);
+ }
+ }
+ // Others
+ if (ampersandMap.get(b) != -1) {
+ return (ampersandMap.get(b));
+ } else if (ampersandMap.get(b.toLowerCase()) != -1) {
+ return (ampersandMap.get(b.toLowerCase()));
+ }
+ // Invalid char
+ return (-1);
+ }
+
+ /** Eats an HTML ampersand code from a List of Characters*/
+ public static String decodeAmpersand(String string) {
+ int pos = string.indexOf('&');
+ if (pos == -1)
+ return (string);
+ StringBuilder result = new StringBuilder(string.length());
+ for (int i = 0; i < string.length(); i++) {
+ if (string.charAt(i) != '&' || i > string.length() - 2) {
+ result.append(string.charAt(i));
+ continue;
+ }
+ // Seek to ';'
+ // We also accept spaces and the end of the String as a delimiter
+ int end = i;
+ while (end < string.length()
+ && !Character.isSpaceChar(string.charAt(end))
+ && string.charAt(end) != ';')
+ end++;
+ String b = string.substring(i + 1, end);
+ if (end < string.length() && string.charAt(end) == ';')
+ end++;
+ int c = decodeAmpersandChar(b);
+ if (c == -1) {
+ result.append('&');
+ } else {
+ result.append((char) c);
+ i = end - 1;
+ }
+ }
+ return (result.toString());
+ }
+
+ /** Tells from the first UTF-8 code character how long the code is.
+ * Returns -1 if the character is not an UTF-8 code start.
+ * Returns 1 if the character is ASCII<128*/
+ public static int Utf8Length(char c) {
+ // 0xxx xxxx
+ if ((c & 0x80) == 0x00)
+ return (1);
+ // 110x xxxx
+ if ((c & 0xE0) == 0xC0)
+ return (2);
+ // 1110 xxxx
+ if ((c & 0xF0) == 0xE0)
+ return (3);
+ // 1111 0xxx
+ if ((c & 0xF8) == 0xF0)
+ return (4);
+ return (-1);
+ }
+
+ /** Decodes UTF8 characters in a string. There is also a built-in way in Java that converts
+ * UTF8 to characters and back, but it does not work with all characters. */
+ public static String decodeUtf8(String string) {
+ if (string.isEmpty())
+ return (string);
+ StringBuilder result = new StringBuilder(string.length());
+ for (int i = 0; i < string.length(); i++) {
+ int len = Utf8Length(string.charAt(i));
+ if (string.length() - i >= len)
+ switch (len) {
+ case 2:
+ if ((string.charAt(i + 1) & 0xC0) != 0x80)
+ break;
+ result.append((char) (((string.charAt(i) & 0x1F) << 6) + (string
+ .charAt(i + 1) & 0x3F)));
+ i++;
+ continue;
+ case 3:
+ if ((string.charAt(i + 1) & 0xC0) != 0x80
+ || (string.charAt(i + 2) & 0xC0) != 0x80)
+ break;
+ result.append((char) (((string.charAt(i + 0) & 0x0F) << 12)
+ + ((string.charAt(i + 1) & 0x3F) << 6) + ((string
+ .charAt(i + 2) & 0x3F))));
+ i += 2;
+ continue;
+ case 4:
+ if ((string.charAt(i + 1) & 0xC0) != 0x80
+ || (string.charAt(i + 2) & 0xC0) != 0x80
+ || (string.charAt(i + 3) & 0xC0) != 0x80)
+ break;
+ result.append((char) (((string.charAt(i + 0) & 0x07) << 18)
+ + ((string.charAt(i + 1) & 0x3F) << 12)
+ + ((string.charAt(i + 2) & 0x3F) << 6) + ((string
+ .charAt(i + 3) & 0x3F))));
+ i += 3;
+ continue;
+ }
+ result.append(string.charAt(i));
+ }
+ return (result.toString());
+ }
+
+ /** Used for encoding selected characters*/
+ public static interface Legal {
+ public boolean isLegal(char c);
+ }
+
+ /** Encodes with backslash all illegal characters*/
+ public static String encodeBackslash(CharSequence s, Legal legal) {
+ StringBuilder b = new StringBuilder((int) (s.length() * 1.5));
+ for (int i = 0; i < s.length(); i++) {
+ if (legal.isLegal(s.charAt(i)) && s.charAt(i) != '\\') {
+ b.append(s.charAt(i));
+ } else {
+ if (charToBackslash.containsKey(s.charAt(i))) {
+ b.append(charToBackslash.get(s.charAt(i)));
+ continue;
+ }
+ b.append("\\u");
+ String hex = Integer.toHexString(s.charAt(i));
+ for (int j = 0; j < 4 - hex.length(); j++)
+ b.append('0');
+ b.append(hex);
+ }
+ }
+ return (b.toString());
+ }
+
+ /** Decodes a backslash sequence*/
+ public static String decodeBackslash(String string) {
+ int pos = string.indexOf('\\');
+ if (pos == -1)
+ return (string);
+ StringBuilder result = new StringBuilder(string.length());
+ for (int i = 0; i < string.length(); i++) {
+ if (string.charAt(i) != '\\' || i + 1 >= string.length()) {
+ result.append(string.charAt(i));
+ continue;
+ }
+ char nextChar = string.charAt(i + 1);
+ switch (nextChar) {
+ case 'u': // Unicodes BS u XXXX
+ try {
+ result.append((char) Integer.parseInt(
+ string.substring(i + 2, i + 6), 16));
+ i += 5;
+ } catch (Exception e) {
+ // Invalid char
+ result.append('\\');
+ }
+ continue;
+ case 'b':
+ result.append((char) 8);
+ i++;
+ continue;
+ case 't':
+ result.append((char) 9);
+ i++;
+ continue;
+ case 'n':
+ result.append((char) 10);
+ i++;
+ continue;
+ case 'f':
+ result.append((char) 12);
+ i++;
+ continue;
+ case 'r':
+ result.append((char) 13);
+ i++;
+ continue;
+ case '\\':
+ result.append('\\');
+ i++;
+ continue;
+ case '"':
+ result.append('"');
+ i++;
+ continue;
+ case '\'':
+ result.append('\'');
+ i++;
+ continue;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7': // Octal codes
+ int end = i + 1;
+ while (end < string.length() && string.charAt(end) >= '0'
+ && string.charAt(end) <= '8')
+ end++;
+ try {
+ result.append((char) Integer.parseInt(
+ string.substring(i + 1, end), 8));
+ i = end - 1;
+ } catch (Exception e) {
+ // Wrong code
+ result.append('\\');
+ }
+ continue;
+ default:
+ result.append('\\');
+ continue;
+ }
+ }
+ return (result.toString());
+ }
+
+ /** Replaces all codes in a String by the 16 bit Unicode characters */
+ public static String decode(String string) {
+ return (decodeAmpersand(decodeUtf8(decodeBackslash(decodePercentage(string)))));
+ }
+
+ /** Encodes a character to UTF8*/
+ public static String encodeUTF8(char c) {
+ if (c <= 0x7F)
+ return ("" + (char) c);
+ else if (c <= 0x7FF)
+ return ("" + (char) (0xC0 + ((c >> 6) & 0x1F)) + (char) (0x80 + (c & 0x3F)));
+ else
+ return ("" + (char) (0xE0 + ((c >> 12) & 0x0F))
+ + (char) (0x80 + ((c >> 6) & 0x3F)) + (char) (0x80 + (c & 0x3F)));
+ }
+
+ /** Encodes a string to UTF8*/
+ public static String encodeUTF8(String string) {
+ StringBuilder result = new StringBuilder(string.length() * 12 / 10);
+ for (int i = 0; i < string.length(); i++) {
+ char c = string.charAt(i);
+ result.append(encodeUTF8(c));
+ }
+ return (result.toString());
+ }
+
+ /** Encodes non-legal characters to Ampersand codes*/
+ public static String encodeAmpersand(String string, Legal legal) {
+ StringBuilder result = new StringBuilder(string.length() * 12 / 10);
+ for (int i = 0; i < string.length(); i++) {
+ char c = string.charAt(i);
+ if (legal.isLegal(c) && c != '&') {
+ result.append(c);
+ continue;
+ }
+ String s = charToAmpersand.get(c);
+ if (s != null) {
+ result.append(s);
+ continue;
+ }
+ result.append("" + ((int) c) + ";");
+ }
+ return (result.toString());
+ }
+
+ /** Encodes a string into Percentage codes. Also does UTF8 encoding.*/
+ public static String encodePercentage(String string, Legal legal) {
+ string = encodeUTF8(string);
+ StringBuilder result = new StringBuilder(string.length() * 12 / 10);
+ for (int i = 0; i < string.length(); i++) {
+ char c = string.charAt(i);
+ if (legal.isLegal(c) && c != '%') {
+ result.append(c);
+ continue;
+ }
+ if (c < 16)
+ result.append("%0" + Integer.toHexString(c).toUpperCase());
+ else
+ result.append("%" + Integer.toHexString(c).toUpperCase());
+ }
+ return (result.toString());
+ }
+
+ /**
+ * Encodes a String with reserved XML characters into a valid xml string for attributes.
+ * @param str
+ * @return
+ */
+ public static String encodeXmlAttribute(String str) {
+ if (str == null)
+ return null;
+ int len = str.length();
+ if (len == 0)
+ return str;
+ StringBuffer encoded = new StringBuffer();
+ for (int i = 0; i < len; i++) {
+ char c = str.charAt(i);
+ if (c == '<')
+ encoded.append("<");
+ else if (c == '\"')
+ encoded.append(""");
+ else if (c == '>')
+ encoded.append(">");
+ else if (c == '\'')
+ encoded.append("'");
+ else if (c == '&')
+ encoded.append("&");
+ else
+ encoded.append(c);
+ }
+ return encoded.toString();
+ }
+
+ /** Returns an HTML-String of the String */
+ public static String toHTML(String s) {
+ return (Char17.encodeAmpersand(s, alphaNumericAndSpace).replace(
+ "
", "
"));
+ }
+
+ /** Replaces illegal characters in the string by hex codes (cannot be undone)*/
+ public static String encodeHex(String s, Legal legal) {
+ StringBuilder result = new StringBuilder();
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ if (legal.isLegal(c))
+ result.append(c);
+ else
+ result.append(Integer.toHexString(s.charAt(i)).toUpperCase());
+ }
+ return (result.toString());
+ }
+
+ /** Tells whether a char is in a range*/
+ public static boolean in(char c, char a, char b) {
+ return (c >= a && c <= b);
+ }
+
+ /** Tells whether a char is in a string*/
+ public static boolean in(char c, String s) {
+ return (s.indexOf(c) != -1);
+ }
+
+ /** Tells whether a char is alphanumeric in the sense of URIs*/
+ public static boolean isAlphanumeric(char c) {
+ return (in(c, 'a', 'z') || in(c, 'A', 'Z') || in(c, '0', '9'));
+ }
+
+ /** Tells whether a char is reserved in the sense of URIs*/
+ public static boolean isReserved(char c) {
+ return (isSubDelim(c) || isGenDelim(c));
+ }
+
+ /** Tells whether a char is unreserved in the sense of URIs (not the same as !reserved)*/
+ public static boolean isUnreserved(char c) {
+ return (isAlphanumeric(c) || in(c, "-._~"));
+ }
+
+ /** Tells whether a string is escaped in the sense of URIs*/
+ public static boolean isEscaped(String s) {
+ return (s.matches("%[0-9A-Fa-f]{2}"));
+ }
+
+ /** Tells whether a char is a sub-delimiter in the sense of URIs*/
+ public static boolean isSubDelim(char c) {
+ return (in(c, "!$&'()*+,="));
+ }
+
+ /** Tells whether a char is a general delimiter in the sense of URIs*/
+ public static boolean isGenDelim(char c) {
+ return (in(c, ":/?#[]@"));
+ }
+
+ /** Tells whether a char is a valid path component in the sense of URIs*/
+ public static boolean isPchar(char c) {
+ return (isUnreserved(c) || isSubDelim(c) || in(c, "@"));
+ }
+
+ /** Legal path components in the sense of URIs*/
+ public static final Legal uriPathComponent = new Legal() {
+ public boolean isLegal(char c) {
+ return isPchar(c);
+ }
+ };
+
+ /** True for ASCII alphanumeric and space*/
+ public static final Legal alphaNumericAndSpace = new Legal() {
+ public boolean isLegal(char c) {
+ return isAlphanumeric(c) || c == ' ';
+ }
+ };
+
+ /** Encodes a char to percentage code, if it is not a path character in the sense of URIs*/
+ public static String encodeURIPathComponent(String s) {
+ return (encodePercentage(s, uriPathComponent));
+ }
+
+ /** TRUE for XML path components*/
+ public static final Legal xmlPathComponent = new Legal() {
+ public boolean isLegal(char c) {
+ return isPchar(c) && c != '&' && c != '"';
+ }
+ };
+
+ /** Encodes a char to percentage code, if it is not a path character in the sense of XMLs*/
+ public static String encodeURIPathComponentXML(String string) {
+ return (encodePercentage(string, xmlPathComponent));
+ }
+
+ /** Decodes a URI path component*/
+ public static String decodeURIPathComponent(String s) {
+ return (Char17.decodePercentage(s));
+ }
+
+ /** Decodes all codes in a String and normalizes all chars */
+ public static String decodeAndNormalize(String s) {
+ return (normalize(decode(s)));
+ }
+
+ /** Normalizes all chars in a String to characters 0x20-0x7F */
+ public static String normalize(String s) {
+ StringBuilder b = new StringBuilder();
+ for (int i = 0; i < s.length(); i++)
+ b.append(normalize(s.charAt(i)));
+ return (b.toString());
+ }
+
+ /** Returns the last character of a String or 0*/
+ public static char last(CharSequence s) {
+ return (s.length() == 0 ? (char) 0 : s.charAt(s.length() - 1));
+ }
+
+ /** Returns the String without the last character */
+ public static String cutLast(String s) {
+ return (s.length() == 0 ? "" : s.substring(0, s.length() - 1));
+ }
+
+ /** Cuts the last character */
+ public static StringBuilder cutLast(StringBuilder s) {
+ s.setLength(s.length() - 1);
+ return (s);
+ }
+
+ /** Upcases the first character in a String*/
+ public static String upCaseFirst(String s) {
+ if (s == null || s.length() == 0)
+ return (s);
+ return (Character.toUpperCase(s.charAt(0)) + s.substring(1));
+ }
+
+ /** Lowcases the first character in a String*/
+ public static String lowCaseFirst(String s) {
+ if (s == null || s.length() == 0)
+ return (s);
+ return (Character.toLowerCase(s.charAt(0)) + s.substring(1));
+ }
+
+ /** Returns a string of the given length, fills with spaces if necessary */
+ public static CharSequence truncate(CharSequence s, int len) {
+ if (s.length() == len)
+ return (s);
+ if (s.length() > len)
+ return (s.subSequence(0, len));
+ StringBuilder result = new StringBuilder(s);
+ while (result.length() < len)
+ result.append(' ');
+ return (result);
+ }
+
+ /** Capitalizes words and lowercases the rest*/
+ public static String capitalize(String s) {
+ StringBuilder result = new StringBuilder();
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ if (i == 0 || i > 0 && !Character.isLetterOrDigit(s.charAt(i - 1)))
+ c = Character.toUpperCase(c);
+ else
+ c = Character.toLowerCase(c);
+ result.append(c);
+ }
+ return (result.toString());
+ }
+
+ /** TRUE if the Charsequence ends with the string */
+ public static boolean endsWith(CharSequence s, String end) {
+ return (s.length() >= end.length() && s.subSequence(
+ s.length() - end.length(), s.length()).equals(end));
+ }
+
+ /** IndexOf */
+ public static int indexOf(char c, CharSequence string) {
+ for (int i = 0; i < string.length(); i++) {
+ if (string.charAt(i) == c)
+ return (i);
+ }
+ return (-1);
+ }
+
+ /** Prints a test case*/
+ public static boolean testCase(String name, String s1, String s2) {
+ if (s1.equals(s2)) {
+ System.out.println(name + ": OK");
+ return (true);
+ } else {
+ System.out.println(name + ": " + s1 + "!=" + s2);
+ return (false);
+ }
+ }
+
+ /** Tests all methods*/
+ public static void test() {
+ String in, out;
+ testCase("EncAmp",
+ encodeAmpersand(in = "a+A&b�", alphaNumericAndSpace),
+ out = "a+A&bä");
+ testCase("DecAmp", decodeAmpersand(out), in);
+ testCase("DecAmp'", decodeAmpersand("&&inv;ä& &"),
+ "&&inv;�& &");
+ testCase("EncBack",
+ encodeBackslash(in = "a+A\\b�\n", alphaNumericAndSpace),
+ out = "a\\u002bA\\\\b\\u00e4\\n");
+ testCase("DecBack", decodeBackslash(out), in);
+ testCase("DecBack'", decodeBackslash("\\00101a\\\\a\\n\\k\\"),
+ "Aa\\a\n\\k\\");
+ testCase("EncHex", encodeHex("a+A\\b�", alphaNumericAndSpace),
+ "a2BA5CbE4");
+ testCase("EncPerc",
+ encodePercentage(in = "a+A%b�", alphaNumericAndSpace),
+ out = "a%2BA%25b%C3%A4");
+ testCase("DecPerc", decodePercentage(out), in);
+ testCase("DecPerc'", decodePercentage("%C3%A4%X%"), "�%X%");
+ testCase("EncURI", encodeURIPathComponent(in = "a+A/b �&"),
+ out = "a+A%2Fb%20%C3%A4&");
+ testCase("DecURI", decodeURIPathComponent(out), in);
+ testCase("EncXML", encodeURIPathComponentXML("a+A/b �&"),
+ "a+A%2Fb%20%C3%A4%26");
+ testCase("EncUTF8", encodeUTF8("a+A/b �<"), "a+A/b ä<");
+ }
+
+ /** Test routine */
+ public static void main(String argv[]) throws Exception {
+ test();
+ }
+
+}