From e335fefca378a762822495b709f061275810b802 Mon Sep 17 00:00:00 2001 From: gadelrab Date: Tue, 16 Feb 2016 11:00:35 +0100 Subject: [PATCH] new datahandlers --- .../mpi/tools/javatools/parsers/Char17.java | 1063 +++++++++++++++++ 1 file changed, 1063 insertions(+) create mode 100644 src/main/java/mpi/tools/javatools/parsers/Char17.java diff --git a/src/main/java/mpi/tools/javatools/parsers/Char17.java b/src/main/java/mpi/tools/javatools/parsers/Char17.java new file mode 100644 index 0000000..34f9dd2 --- /dev/null +++ b/src/main/java/mpi/tools/javatools/parsers/Char17.java @@ -0,0 +1,1063 @@ +package mpi.tools.javatools.parsers; + +import java.util.Arrays; + +import mpi.tools.javatools.datatypes.IntHashMap; +import mpi.tools.javatools.datatypes.IntKeyMap; + +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). +

+This class is the improved version of the original Char class, +which takes into account that Java 1.7 performs a copy for substring() +

+This class provides static methods to decode, encode and normalize Strings.
+Decoding converts the following codes to Java 16-bit characters (char): +

+

+Encoding is the inverse operation. It takes a Java 16-bit character (char) and +outputs its encoding in HTML, as a backslash code, as a percentage code or in UTF8. +

+Normalization converts the following Unicode characters (Java 16-bit chars) +to ASCII-characters in the range 0x20-0x7F: +

+ */ +public class Char17 { + + /** String returned by the default implementation of defaultNormalizer, "[?]"*/ + public static String UNKNOWN = "[?]"; + + /** Maps a special character to a HTML ampersand sequence */ + public static IntKeyMap charToAmpersand = new IntKeyMap( + '&', "&", '\'', "'", '<', "<", '>', ">", '"', + """); + + /** Maps a special character to a backslash sequence */ + public static IntKeyMap charToBackslash = new IntKeyMap( + '\\', "\\\\", '\n', "\\n"); + + /** Maps HTML ampersand sequences to strings */ + public static IntHashMap ampersandMap = new IntHashMap( + Arrays.asList((Object) "nbsp", (char) 160, "iexcl", (char) 161, + "cent", (char) 162, "pound", (char) 163, "curren", + (char) 164, "yen", (char) 165, "brvbar", (char) 166, + "sect", (char) 167, "uml", (char) 168, "copy", (char) 169, + "ordf", (char) 170, "laquo", (char) 171, "not", (char) 172, + "shy", (char) 173, "reg", (char) 174, "macr", (char) 175, + "deg", (char) 176, "plusmn", (char) 177, "sup2", + (char) 178, "sup3", (char) 179, "acute", (char) 180, + "micro", (char) 181, "para", (char) 182, "middot", + (char) 183, "cedil", (char) 184, "sup1", (char) 185, + "ordm", (char) 186, "raquo", (char) 187, "frac14", + (char) 188, "frac12", (char) 189, "frac34", (char) 190, + "iquest", (char) 191, "Agrave", (char) 192, "Aacute", + (char) 193, "Acirc", (char) 194, "Atilde", (char) 195, + "Auml", (char) 196, "Aring", (char) 197, "AElig", + (char) 198, "Ccedil", (char) 199, "Egrave", (char) 200, + "Eacute", (char) 201, "Ecirc", (char) 202, "Euml", + (char) 203, "Igrave", (char) 204, "Iacute", (char) 205, + "Icirc", (char) 206, "Iuml", (char) 207, "ETH", (char) 208, + "Ntilde", (char) 209, "Ograve", (char) 210, "Oacute", + (char) 211, "Ocirc", (char) 212, "Otilde", (char) 213, + "Ouml", (char) 214, "times", (char) 215, "Oslash", + (char) 216, "Ugrave", (char) 217, "Uacute", (char) 218, + "Ucirc", (char) 219, "Uuml", (char) 220, "Yacute", + (char) 221, "THORN", (char) 222, "szlig", (char) 223, + "agrave", (char) 224, "aacute", (char) 225, "acirc", + (char) 226, "atilde", (char) 227, "auml", (char) 228, + "aring", (char) 229, "aelig", (char) 230, "ccedil", + (char) 231, "egrave", (char) 232, "eacute", (char) 233, + "ecirc", (char) 234, "euml", (char) 235, "igrave", + (char) 236, "iacute", (char) 237, "icirc", (char) 238, + "iuml", (char) 239, "eth", (char) 240, "ntilde", + (char) 241, "ograve", (char) 242, "oacute", (char) 243, + "ocirc", (char) 244, "otilde", (char) 245, "ouml", + (char) 246, "divide", (char) 247, "oslash", (char) 248, + "ugrave", (char) 249, "uacute", (char) 250, "ucirc", + (char) 251, "uuml", (char) 252, "yacute", (char) 253, + "thorn", (char) 254, "yuml", (char) 255, "fnof", + (char) 402, "Alpha", (char) 913, "Beta", (char) 914, + "Gamma", (char) 915, "Delta", (char) 916, "Epsilon", + (char) 917, "Zeta", (char) 918, "Eta", (char) 919, "Theta", + (char) 920, "Iota", (char) 921, "Kappa", (char) 922, + "Lambda", (char) 923, "Mu", (char) 924, "Nu", (char) 925, + "Xi", (char) 926, "Omicron", (char) 927, "Pi", (char) 928, + "Rho", (char) 929, "Sigma", (char) 931, "Tau", (char) 932, + "Upsilon", (char) 933, "Phi", (char) 934, "Chi", + (char) 935, "Psi", (char) 936, "Omega", (char) 937, + "alpha", (char) 945, "beta", (char) 946, "gamma", + (char) 947, "delta", (char) 948, "epsilon", (char) 949, + "zeta", (char) 950, "eta", (char) 951, "theta", (char) 952, + "iota", (char) 953, "kappa", (char) 954, "lambda", + (char) 955, "mu", (char) 956, "nu", (char) 957, "xi", + (char) 958, "omicron", (char) 959, "pi", (char) 960, "rho", + (char) 961, "sigmaf", (char) 962, "sigma", (char) 963, + "tau", (char) 964, "upsilon", (char) 965, "phi", + (char) 966, "chi", (char) 967, "psi", (char) 968, "omega", + (char) 969, "thetasym", (char) 977, "upsih", (char) 978, + "piv", (char) 982, "bull", (char) 8226, "hellip", + (char) 8230, "prime", (char) 8242, "Prime", (char) 8243, + "oline", (char) 8254, "frasl", (char) 8260, "weierp", + (char) 8472, "image", (char) 8465, "real", (char) 8476, + "trade", (char) 8482, "alefsym", (char) 8501, "larr", + (char) 8592, "uarr", (char) 8593, "rarr", (char) 8594, + "darr", (char) 8595, "harr", (char) 8596, "crarr", + (char) 8629, "lArr", (char) 8656, "uArr", (char) 8657, + "rArr", (char) 8658, "dArr", (char) 8659, "hArr", + (char) 8660, "forall", (char) 8704, "part", (char) 8706, + "exist", (char) 8707, "empty", (char) 8709, "nabla", + (char) 8711, "isin", (char) 8712, "notin", (char) 8713, + "ni", (char) 8715, "prod", (char) 8719, "sum", (char) 8721, + "minus", (char) 8722, "lowast", (char) 8727, "radic", + (char) 8730, "prop", (char) 8733, "infin", (char) 8734, + "ang", (char) 8736, "and", (char) 8743, "or", (char) 8744, + "cap", (char) 8745, "cup", (char) 8746, "int", (char) 8747, + "there4", (char) 8756, "sim", (char) 8764, "cong", + (char) 8773, "asymp", (char) 8776, "ne", (char) 8800, + "equiv", (char) 8801, "le", (char) 8804, "ge", (char) 8805, + "sub", (char) 8834, "sup", (char) 8835, "nsub", + (char) 8836, "sube", (char) 8838, "supe", (char) 8839, + "oplus", (char) 8853, "otimes", (char) 8855, "perp", + (char) 8869, "sdot", (char) 8901, "lceil", (char) 8968, + "rceil", (char) 8969, "lfloor", (char) 8970, "rfloor", + (char) 8971, "lang", (char) 9001, "rang", (char) 9002, + "loz", (char) 9674, "spades", (char) 9824, "clubs", + (char) 9827, "hearts", (char) 9829, "diams", (char) 9830, + "quot", (char) 34, "amp", (char) 38, "lt", (char) 60, "gt", + (char) 62, "OElig", (char) 338, "oelig", (char) 339, + "Scaron", (char) 352, "scaron", (char) 353, "Yuml", + (char) 376, "circ", (char) 710, "tilde", (char) 732, + "ensp", (char) 8194, "emsp", (char) 8195, "thinsp", + (char) 8201, "zwnj", (char) 8204, "zwj", + (char) 8205, + "lrm", + (char) 8206, + "rlm", + (char) 8207, + "ndash", + (char) 8211, // 0x2013 + "mdash", (char) 8212, "lsquo", (char) 8216, "rsquo", + (char) 8217, "sbquo", (char) 8218, "ldquo", (char) 8220, + "rdquo", (char) 8221, "bdquo", (char) 8222, "dagger", + (char) 8224, "Dagger", (char) 8225, "permil", (char) 8240, + "lsaquo", (char) 8249, "rsaquo", (char) 8250, "euro", + (char) 8364, "apos", '\'')); + + /** Maps characters to normalizations */ + public static IntKeyMap normalizeMap = new IntKeyMap( + // ASCII + (char) 7, + "BEEP", + (char) 9, + " ", + (char) 10, + "\n", + + // Latin-1 + (char) 160, + " ", + (char) 161, + "!", + (char) 162, + "cent", + (char) 163, + "pound", + (char) 164, + "currency", + (char) 165, + "yen", + (char) 166, + "|", + (char) 167, + "/", + (char) 169, + "(c)", + (char) 170, + "^a", + (char) 171, + "\"", + (char) 172, + "~", + (char) 173, + "", + (char) 174, + "(R)", + (char) 176, + "degree", + (char) 177, + "+/-", + (char) 178, + "^2", + (char) 179, + "^3", + (char) 180, + "'", + (char) 181, + "mu", + (char) 182, + "P", + (char) 183, + ".", + (char) 184, + ",", + (char) 185, + "^1", + (char) 186, + "^o", + (char) 187, + "\"", + (char) 188, + "1/4", + (char) 189, + "1/2", + (char) 190, + "3/4", + (char) 191, + "?", + (char) 0xC4, + "Ae", + (char) 0xD6, + "Oe", + (char) 0xDC, + "Ue", + (char) 0xDF, + "ss", + (char) 0xC6, + "Ae", + (char) 0xC7, + "C", + (char) 0xD0, + "D", + (char) 0xD1, + "N", + (char) 0xD7, + "x", + (char) 0xDD, + "Y", + (char) 0xDE, + "b", + (char) 0xF7, + "/", + (char) 0xFF, + "y", + + // Latin Extended-A + (char) 0x132, + "IJ", + (char) 0x134, + "J", + (char) 0x170, + "Ue", + (char) 0x174, + "W", + (char) 0x17F, + "f", + + // Greek + (char) 0x374, "'", (char) 0x375, ",", (char) 0x37A, ",", + (char) 0x37E, ";", (char) 0x384, "'", (char) 0x385, "'", + (char) 0x386, "A", (char) 0x387, ".", (char) 0x388, "E", + (char) 0x380, "I", (char) 0x38C, "O", (char) 0x38E, "Y", + (char) 0x38F, "O", (char) 0x390, "i", (char) 215, "*", (char) 913, + "A", (char) 914, "B", (char) 915, "G", (char) 916, "D", (char) 917, + "E", (char) 918, "Z", (char) 919, "E", (char) 920, "Th", + (char) 921, + "I", + (char) 922, + "K", + (char) 923, + "L", + (char) 924, + "M", + (char) 925, + "N", + (char) 926, + "X", + (char) 927, + "O", + (char) 928, + "P", + (char) 929, + "R", + (char) 931, + "S", + (char) 932, + "T", + (char) 933, + "Y", + (char) 934, + "Ph", + (char) 935, + "Ch", + (char) 936, + "Ps", + (char) 937, + "O", + (char) 977, + "th", + (char) 978, + "y", + (char) 982, + "pi", + + // General Punctuation + (char) 0x2013, "-", (char) 0x2016, "||", (char) 0x2017, "_", + (char) 0x2020, "+", (char) 0x2021, "++", (char) 0x2022, "*", + (char) 0x2023, "*", (char) 0x2024, ".", (char) 0x2025, "..", + (char) 0x2026, "...", (char) 0x2027, ".", (char) 0x2028, "\n", + (char) 0x2030, "/1000", (char) 0x2031, "/10000", (char) 0x2032, + "'", (char) 0x2033, "''", (char) 0x2034, "'''", (char) 0x2035, "'", + (char) 0x2036, "''", (char) 0x2037, "'''", (char) 0x2038, "^", + (char) 0x2039, "\"", (char) 0x203A, "\"", (char) 0x203B, "*", + (char) 0x203C, "!!", (char) 0x203D, "?!", (char) 0x2041, ",", + (char) 0x2042, "***", (char) 0x2043, "-", (char) 0x2044, "/", + (char) 0x2045, "[", (char) 0x2046, "]", (char) 0x2047, "??", + (char) 0x2048, "?!", (char) 0x2049, "!?", + (char) 0x204A, + "-", + (char) 0x204B, + "P", + (char) 0x204C, + "<", + (char) 0x204D, + ">", + (char) 0x204F, + ";", + (char) 0x2050, + "-", + (char) 0x2051, + "**", + (char) 0x2052, + "./.", + (char) 0x2053, + "~", + (char) 0x2054, + "_", + (char) 0x2055, + "_", + + // Mathematical symbols + (char) 8465, "I", (char) 8476, "R", (char) 8482, "(TM)", + (char) 8501, "a", (char) 8592, "<-", (char) 8593, "^", (char) 8594, + "->", (char) 8595, "v", (char) 8596, "<->", (char) 8629, "<-'", + (char) 8656, "<=", (char) 8657, "^", (char) 8658, "=>", + (char) 8659, "v", (char) 8660, "<=>", (char) 8704, "FOR ALL", + (char) 8706, "d", (char) 8707, "EXIST", (char) 8709, "{}", + (char) 8712, "IN", (char) 8713, "NOT IN", (char) 8715, "CONTAINS", + (char) 8719, "PRODUCT", (char) 8721, "SUM", (char) 8722, "-", + (char) 8727, "*", (char) 8730, "SQRT", (char) 8733, "~", + (char) 8734, "INF", (char) 8736, "angle", (char) 8743, "&", + (char) 8744, "|", (char) 8745, "INTERSECTION", (char) 8746, + "UNION", (char) 8747, "INTEGRAL", (char) 8756, "=>", (char) 8764, + "~", (char) 8773, "~=", (char) 8776, "~=", (char) 8800, "!=", + (char) 8801, "==", (char) 8804, "=<", (char) 8805, ">=", + (char) 8834, "SUBSET OF", (char) 8835, "SUPERSET OF", (char) 8836, + "NOT SUBSET OF", (char) 8838, "SUBSET OR EQUAL", (char) 8839, + "SUPERSET OR EQUAL", (char) 8853, "(+)", (char) 8855, "(*)", + (char) 8869, "_|_", (char) 8901, "*", (char) 8364, + "EUR", + + // Ligatures + (char) 0xFB00, "ff", (char) 0xFB01, "fi", (char) 0xFB02, "fl", + (char) 0xFB03, "ffi", (char) 0xFB04, "ffl", (char) 0xFB05, "ft", + (char) 0xFB06, "st"); + + /** Normalizes a character to a String of characters in the range 0x20-0x7F. + * Returns a String, because some characters are + * normalized to multiple characters (e.g. umlauts) and + * some characters are normalized to zero characters (e.g. special Unicode space chars). + * Returns null for the EndOfFile character -1 */ + public static String normalize(int c) { + // EOF + if (c == -1) + return (null); + + // ASCII chars + if (c >= ' ' && c <= 128) + return ("" + (char) c); + + // Upper case + boolean u = Character.isUpperCase(c); + char cu = (char) Character.toUpperCase(c); + + // Check map + if (normalizeMap.get(cu) != null) + return (u ? normalizeMap.get(cu) : normalizeMap.get(cu) + .toLowerCase()); + + // ASCII + if (c < ' ') + return (""); + + // Latin-1 + if (cu >= 0xC0 && cu <= 0xC5) + return (u ? "A" : "a"); + if (cu >= 0xC8 && cu <= 0xCB) + return (u ? "E" : "e"); + if (cu >= 0xCC && cu <= 0xCF) + return (u ? "I" : "i"); + if (cu >= 0xD2 && cu <= 0xD8) + return (u ? "O" : "o"); + if (cu >= 0x80 && cu <= 0xA0) + return (" "); + + // Latin Extended-A + if (cu >= 0x100 && cu <= 0x105) + return (u ? "A" : "a"); + if (cu >= 0x106 && cu <= 0x10D) + return (u ? "C" : "c"); + if (cu >= 0x10E && cu <= 0x111) + return (u ? "D" : "d"); + if (cu >= 0x112 && cu <= 0x11B) + return (u ? "E" : "e"); + if (cu >= 0x11C && cu <= 0x123) + return (u ? "G" : "g"); + if (cu >= 0x124 && cu <= 0x127) + return (u ? "H" : "h"); + if (cu >= 0x128 && cu <= 0x131) + return (u ? "I" : "i"); + if (cu >= 0x136 && cu <= 0x138) + return (u ? "K" : "k"); + if (cu >= 0x139 && cu <= 0x142) + return (u ? "L" : "l"); + if (cu >= 0x143 && cu <= 0x14B) + return (u ? "N" : "n"); + if (cu >= 0x14C && cu <= 0x14F) + return (u ? "O" : "o"); + if (cu >= 0x150 && cu <= 0x153) + return (u ? "Oe" : "oe"); + if (cu >= 0x156 && cu <= 0x159) + return (u ? "R" : "r"); + if (cu >= 0x15A && cu <= 0x161) + return (u ? "S" : "s"); + if (cu >= 0x161 && cu <= 0x167) + return (u ? "T" : "t"); + if (cu >= 0x176 && cu <= 0x178) + return (u ? "Y" : "y"); + if (cu >= 0x179 && cu <= 0x17E) + return (u ? "Z" : "z"); + + // General Punctuation + if (cu >= 0x2000 && cu <= 0x200A) + return (" "); + if (cu >= 0x200B && cu <= 0x200F) + return (""); + if (cu >= 0x2010 && cu <= 0x2015) + return ("--"); + if (cu >= 0x2018 && cu <= 0x201B) + return ("'"); + if (cu >= 0x201C && cu <= 0x201F) + return ("\""); + if (cu >= 0x2029 && cu <= 0x202F) + return (" "); + if (cu >= 0x203E && cu <= 0x2040) + return ("-"); + if (cu >= 0x2056 && cu <= 0x205E) + return ("."); + + return (UNKNOWN); + } + + /** Decodes percentage characters of the form "%xx" in a string. Also does UTF8 decoding.*/ + public static String decodePercentage(String string) { + int pos = string.indexOf('%'); + if (pos == -1) + return (string); + StringBuilder result = new StringBuilder(string.length()); + for (int i = 0; i < string.length(); i++) { + if (string.charAt(i) != '%' || i > string.length() - 3) { + result.append(string.charAt(i)); + continue; + } + try { + result.append((char) Integer.parseInt( + string.substring(i + 1, i + 3), 16)); + i += 2; + } catch (Exception e) { + // Illegal percentage code, just keep it literally + result.append('%'); + } + } + return (decodeUtf8(result.toString())); + } + + /** Decodes an HTML ampersand code such as "& amp", returns -1 in case of failure*/ + public static int decodeAmpersandChar(String b) { + // Get just the code portion + if (b.startsWith("&")) + b = b.substring(1); + if (b.endsWith(";")) + b = cutLast(b); + // Hexadecimal characters + if (b.startsWith("#x")) { + try { + return (((char) Integer.parseInt(b.substring(2), 16))); + } catch (Exception e) { + // Invalid char + return (-1); + } + } + // Decimal characters + if (b.startsWith("#")) { + try { + return (((char) Integer.parseInt(b.substring(1)))); + } catch (Exception e) { + // Invalid char + return (-1); + } + } + // Others + if (ampersandMap.get(b) != -1) { + return (ampersandMap.get(b)); + } else if (ampersandMap.get(b.toLowerCase()) != -1) { + return (ampersandMap.get(b.toLowerCase())); + } + // Invalid char + return (-1); + } + + /** Eats an HTML ampersand code from a List of Characters*/ + public static String decodeAmpersand(String string) { + int pos = string.indexOf('&'); + if (pos == -1) + return (string); + StringBuilder result = new StringBuilder(string.length()); + for (int i = 0; i < string.length(); i++) { + if (string.charAt(i) != '&' || i > string.length() - 2) { + result.append(string.charAt(i)); + continue; + } + // Seek to ';' + // We also accept spaces and the end of the String as a delimiter + int end = i; + while (end < string.length() + && !Character.isSpaceChar(string.charAt(end)) + && string.charAt(end) != ';') + end++; + String b = string.substring(i + 1, end); + if (end < string.length() && string.charAt(end) == ';') + end++; + int c = decodeAmpersandChar(b); + if (c == -1) { + result.append('&'); + } else { + result.append((char) c); + i = end - 1; + } + } + return (result.toString()); + } + + /** Tells from the first UTF-8 code character how long the code is. + * Returns -1 if the character is not an UTF-8 code start. + * Returns 1 if the character is ASCII<128*/ + public static int Utf8Length(char c) { + // 0xxx xxxx + if ((c & 0x80) == 0x00) + return (1); + // 110x xxxx + if ((c & 0xE0) == 0xC0) + return (2); + // 1110 xxxx + if ((c & 0xF0) == 0xE0) + return (3); + // 1111 0xxx + if ((c & 0xF8) == 0xF0) + return (4); + return (-1); + } + + /** Decodes UTF8 characters in a string. There is also a built-in way in Java that converts + * UTF8 to characters and back, but it does not work with all characters. */ + public static String decodeUtf8(String string) { + if (string.isEmpty()) + return (string); + StringBuilder result = new StringBuilder(string.length()); + for (int i = 0; i < string.length(); i++) { + int len = Utf8Length(string.charAt(i)); + if (string.length() - i >= len) + switch (len) { + case 2: + if ((string.charAt(i + 1) & 0xC0) != 0x80) + break; + result.append((char) (((string.charAt(i) & 0x1F) << 6) + (string + .charAt(i + 1) & 0x3F))); + i++; + continue; + case 3: + if ((string.charAt(i + 1) & 0xC0) != 0x80 + || (string.charAt(i + 2) & 0xC0) != 0x80) + break; + result.append((char) (((string.charAt(i + 0) & 0x0F) << 12) + + ((string.charAt(i + 1) & 0x3F) << 6) + ((string + .charAt(i + 2) & 0x3F)))); + i += 2; + continue; + case 4: + if ((string.charAt(i + 1) & 0xC0) != 0x80 + || (string.charAt(i + 2) & 0xC0) != 0x80 + || (string.charAt(i + 3) & 0xC0) != 0x80) + break; + result.append((char) (((string.charAt(i + 0) & 0x07) << 18) + + ((string.charAt(i + 1) & 0x3F) << 12) + + ((string.charAt(i + 2) & 0x3F) << 6) + ((string + .charAt(i + 3) & 0x3F)))); + i += 3; + continue; + } + result.append(string.charAt(i)); + } + return (result.toString()); + } + + /** Used for encoding selected characters*/ + public static interface Legal { + public boolean isLegal(char c); + } + + /** Encodes with backslash all illegal characters*/ + public static String encodeBackslash(CharSequence s, Legal legal) { + StringBuilder b = new StringBuilder((int) (s.length() * 1.5)); + for (int i = 0; i < s.length(); i++) { + if (legal.isLegal(s.charAt(i)) && s.charAt(i) != '\\') { + b.append(s.charAt(i)); + } else { + if (charToBackslash.containsKey(s.charAt(i))) { + b.append(charToBackslash.get(s.charAt(i))); + continue; + } + b.append("\\u"); + String hex = Integer.toHexString(s.charAt(i)); + for (int j = 0; j < 4 - hex.length(); j++) + b.append('0'); + b.append(hex); + } + } + return (b.toString()); + } + + /** Decodes a backslash sequence*/ + public static String decodeBackslash(String string) { + int pos = string.indexOf('\\'); + if (pos == -1) + return (string); + StringBuilder result = new StringBuilder(string.length()); + for (int i = 0; i < string.length(); i++) { + if (string.charAt(i) != '\\' || i + 1 >= string.length()) { + result.append(string.charAt(i)); + continue; + } + char nextChar = string.charAt(i + 1); + switch (nextChar) { + case 'u': // Unicodes BS u XXXX + try { + result.append((char) Integer.parseInt( + string.substring(i + 2, i + 6), 16)); + i += 5; + } catch (Exception e) { + // Invalid char + result.append('\\'); + } + continue; + case 'b': + result.append((char) 8); + i++; + continue; + case 't': + result.append((char) 9); + i++; + continue; + case 'n': + result.append((char) 10); + i++; + continue; + case 'f': + result.append((char) 12); + i++; + continue; + case 'r': + result.append((char) 13); + i++; + continue; + case '\\': + result.append('\\'); + i++; + continue; + case '"': + result.append('"'); + i++; + continue; + case '\'': + result.append('\''); + i++; + continue; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': // Octal codes + int end = i + 1; + while (end < string.length() && string.charAt(end) >= '0' + && string.charAt(end) <= '8') + end++; + try { + result.append((char) Integer.parseInt( + string.substring(i + 1, end), 8)); + i = end - 1; + } catch (Exception e) { + // Wrong code + result.append('\\'); + } + continue; + default: + result.append('\\'); + continue; + } + } + return (result.toString()); + } + + /** Replaces all codes in a String by the 16 bit Unicode characters */ + public static String decode(String string) { + return (decodeAmpersand(decodeUtf8(decodeBackslash(decodePercentage(string))))); + } + + /** Encodes a character to UTF8*/ + public static String encodeUTF8(char c) { + if (c <= 0x7F) + return ("" + (char) c); + else if (c <= 0x7FF) + return ("" + (char) (0xC0 + ((c >> 6) & 0x1F)) + (char) (0x80 + (c & 0x3F))); + else + return ("" + (char) (0xE0 + ((c >> 12) & 0x0F)) + + (char) (0x80 + ((c >> 6) & 0x3F)) + (char) (0x80 + (c & 0x3F))); + } + + /** Encodes a string to UTF8*/ + public static String encodeUTF8(String string) { + StringBuilder result = new StringBuilder(string.length() * 12 / 10); + for (int i = 0; i < string.length(); i++) { + char c = string.charAt(i); + result.append(encodeUTF8(c)); + } + return (result.toString()); + } + + /** Encodes non-legal characters to Ampersand codes*/ + public static String encodeAmpersand(String string, Legal legal) { + StringBuilder result = new StringBuilder(string.length() * 12 / 10); + for (int i = 0; i < string.length(); i++) { + char c = string.charAt(i); + if (legal.isLegal(c) && c != '&') { + result.append(c); + continue; + } + String s = charToAmpersand.get(c); + if (s != null) { + result.append(s); + continue; + } + result.append("&#" + ((int) c) + ";"); + } + return (result.toString()); + } + + /** Encodes a string into Percentage codes. Also does UTF8 encoding.*/ + public static String encodePercentage(String string, Legal legal) { + string = encodeUTF8(string); + StringBuilder result = new StringBuilder(string.length() * 12 / 10); + for (int i = 0; i < string.length(); i++) { + char c = string.charAt(i); + if (legal.isLegal(c) && c != '%') { + result.append(c); + continue; + } + if (c < 16) + result.append("%0" + Integer.toHexString(c).toUpperCase()); + else + result.append("%" + Integer.toHexString(c).toUpperCase()); + } + return (result.toString()); + } + + /** + * Encodes a String with reserved XML characters into a valid xml string for attributes. + * @param str + * @return + */ + public static String encodeXmlAttribute(String str) { + if (str == null) + return null; + int len = str.length(); + if (len == 0) + return str; + StringBuffer encoded = new StringBuffer(); + for (int i = 0; i < len; i++) { + char c = str.charAt(i); + if (c == '<') + encoded.append("<"); + else if (c == '\"') + encoded.append("""); + else if (c == '>') + encoded.append(">"); + else if (c == '\'') + encoded.append("'"); + else if (c == '&') + encoded.append("&"); + else + encoded.append(c); + } + return encoded.toString(); + } + + /** Returns an HTML-String of the String */ + public static String toHTML(String s) { + return (Char17.encodeAmpersand(s, alphaNumericAndSpace).replace( + " ", "
")); + } + + /** Replaces illegal characters in the string by hex codes (cannot be undone)*/ + public static String encodeHex(String s, Legal legal) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (legal.isLegal(c)) + result.append(c); + else + result.append(Integer.toHexString(s.charAt(i)).toUpperCase()); + } + return (result.toString()); + } + + /** Tells whether a char is in a range*/ + public static boolean in(char c, char a, char b) { + return (c >= a && c <= b); + } + + /** Tells whether a char is in a string*/ + public static boolean in(char c, String s) { + return (s.indexOf(c) != -1); + } + + /** Tells whether a char is alphanumeric in the sense of URIs*/ + public static boolean isAlphanumeric(char c) { + return (in(c, 'a', 'z') || in(c, 'A', 'Z') || in(c, '0', '9')); + } + + /** Tells whether a char is reserved in the sense of URIs*/ + public static boolean isReserved(char c) { + return (isSubDelim(c) || isGenDelim(c)); + } + + /** Tells whether a char is unreserved in the sense of URIs (not the same as !reserved)*/ + public static boolean isUnreserved(char c) { + return (isAlphanumeric(c) || in(c, "-._~")); + } + + /** Tells whether a string is escaped in the sense of URIs*/ + public static boolean isEscaped(String s) { + return (s.matches("%[0-9A-Fa-f]{2}")); + } + + /** Tells whether a char is a sub-delimiter in the sense of URIs*/ + public static boolean isSubDelim(char c) { + return (in(c, "!$&'()*+,=")); + } + + /** Tells whether a char is a general delimiter in the sense of URIs*/ + public static boolean isGenDelim(char c) { + return (in(c, ":/?#[]@")); + } + + /** Tells whether a char is a valid path component in the sense of URIs*/ + public static boolean isPchar(char c) { + return (isUnreserved(c) || isSubDelim(c) || in(c, "@")); + } + + /** Legal path components in the sense of URIs*/ + public static final Legal uriPathComponent = new Legal() { + public boolean isLegal(char c) { + return isPchar(c); + } + }; + + /** True for ASCII alphanumeric and space*/ + public static final Legal alphaNumericAndSpace = new Legal() { + public boolean isLegal(char c) { + return isAlphanumeric(c) || c == ' '; + } + }; + + /** Encodes a char to percentage code, if it is not a path character in the sense of URIs*/ + public static String encodeURIPathComponent(String s) { + return (encodePercentage(s, uriPathComponent)); + } + + /** TRUE for XML path components*/ + public static final Legal xmlPathComponent = new Legal() { + public boolean isLegal(char c) { + return isPchar(c) && c != '&' && c != '"'; + } + }; + + /** Encodes a char to percentage code, if it is not a path character in the sense of XMLs*/ + public static String encodeURIPathComponentXML(String string) { + return (encodePercentage(string, xmlPathComponent)); + } + + /** Decodes a URI path component*/ + public static String decodeURIPathComponent(String s) { + return (Char17.decodePercentage(s)); + } + + /** Decodes all codes in a String and normalizes all chars */ + public static String decodeAndNormalize(String s) { + return (normalize(decode(s))); + } + + /** Normalizes all chars in a String to characters 0x20-0x7F */ + public static String normalize(String s) { + StringBuilder b = new StringBuilder(); + for (int i = 0; i < s.length(); i++) + b.append(normalize(s.charAt(i))); + return (b.toString()); + } + + /** Returns the last character of a String or 0*/ + public static char last(CharSequence s) { + return (s.length() == 0 ? (char) 0 : s.charAt(s.length() - 1)); + } + + /** Returns the String without the last character */ + public static String cutLast(String s) { + return (s.length() == 0 ? "" : s.substring(0, s.length() - 1)); + } + + /** Cuts the last character */ + public static StringBuilder cutLast(StringBuilder s) { + s.setLength(s.length() - 1); + return (s); + } + + /** Upcases the first character in a String*/ + public static String upCaseFirst(String s) { + if (s == null || s.length() == 0) + return (s); + return (Character.toUpperCase(s.charAt(0)) + s.substring(1)); + } + + /** Lowcases the first character in a String*/ + public static String lowCaseFirst(String s) { + if (s == null || s.length() == 0) + return (s); + return (Character.toLowerCase(s.charAt(0)) + s.substring(1)); + } + + /** Returns a string of the given length, fills with spaces if necessary */ + public static CharSequence truncate(CharSequence s, int len) { + if (s.length() == len) + return (s); + if (s.length() > len) + return (s.subSequence(0, len)); + StringBuilder result = new StringBuilder(s); + while (result.length() < len) + result.append(' '); + return (result); + } + + /** Capitalizes words and lowercases the rest*/ + public static String capitalize(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (i == 0 || i > 0 && !Character.isLetterOrDigit(s.charAt(i - 1))) + c = Character.toUpperCase(c); + else + c = Character.toLowerCase(c); + result.append(c); + } + return (result.toString()); + } + + /** TRUE if the Charsequence ends with the string */ + public static boolean endsWith(CharSequence s, String end) { + return (s.length() >= end.length() && s.subSequence( + s.length() - end.length(), s.length()).equals(end)); + } + + /** IndexOf */ + public static int indexOf(char c, CharSequence string) { + for (int i = 0; i < string.length(); i++) { + if (string.charAt(i) == c) + return (i); + } + return (-1); + } + + /** Prints a test case*/ + public static boolean testCase(String name, String s1, String s2) { + if (s1.equals(s2)) { + System.out.println(name + ": OK"); + return (true); + } else { + System.out.println(name + ": " + s1 + "!=" + s2); + return (false); + } + } + + /** Tests all methods*/ + public static void test() { + String in, out; + testCase("EncAmp", + encodeAmpersand(in = "a+A&b�", alphaNumericAndSpace), + out = "a+A&bä"); + testCase("DecAmp", decodeAmpersand(out), in); + testCase("DecAmp'", decodeAmpersand("&&inv;ä& &"), + "&&inv;�& &"); + testCase("EncBack", + encodeBackslash(in = "a+A\\b�\n", alphaNumericAndSpace), + out = "a\\u002bA\\\\b\\u00e4\\n"); + testCase("DecBack", decodeBackslash(out), in); + testCase("DecBack'", decodeBackslash("\\00101a\\\\a\\n\\k\\"), + "Aa\\a\n\\k\\"); + testCase("EncHex", encodeHex("a+A\\b�", alphaNumericAndSpace), + "a2BA5CbE4"); + testCase("EncPerc", + encodePercentage(in = "a+A%b�", alphaNumericAndSpace), + out = "a%2BA%25b%C3%A4"); + testCase("DecPerc", decodePercentage(out), in); + testCase("DecPerc'", decodePercentage("%C3%A4%X%"), "�%X%"); + testCase("EncURI", encodeURIPathComponent(in = "a+A/b �&"), + out = "a+A%2Fb%20%C3%A4&"); + testCase("DecURI", decodeURIPathComponent(out), in); + testCase("EncXML", encodeURIPathComponentXML("a+A/b �&"), + "a+A%2Fb%20%C3%A4%26"); + testCase("EncUTF8", encodeUTF8("a+A/b �<"), "a+A/b ä<"); + } + + /** Test routine */ + public static void main(String argv[]) throws Exception { + test(); + } + +}