001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.jexl2.parser;
018    
019    /**
020     * Common constant strings utilities.
021     * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' (i.e. \) character. Escaping is used to neutralize string delimiters (the single
 * and double quotes) and to read Unicode hexadecimal encoded characters.
025     * </p>
026     * <p>
027     * The only escapable characters are the single and double quotes - ''' and '"' -,
028     * a Unicode sequence starting with 'u' followed by 4 hexadecimals and
029     * the backslash character - '\' - itself.
030     * </p>
031     * <p>
032     * A sequence where '\' occurs before any non-escapable character or sequence has no effect, the
033     * sequence output being the same as the input.
034     * </p>
035     */
public class StringParser {
    /** Default constructor.  */
    public StringParser() {}

    /**
     * Builds a string, handles escaping through '\' syntax.
     * <p>
     * When {@code eatsep} is true, the first character of {@code str} is taken as the
     * separator (the quote character) and both the first and the last characters of
     * {@code str} are excluded from the result.
     * </p>
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     * @throws NullPointerException if {@code str} is null
     * @throws IndexOutOfBoundsException if {@code eatsep} is true and {@code str} is empty
     */
    public static String buildString(CharSequence str, boolean eatsep) {
        StringBuilder strb = new StringBuilder(str.length());
        // a separator of 0 means "no known delimiter": both quote kinds are then escapable
        char sep = eatsep ? str.charAt(0) : 0;
        // skip the leading and trailing delimiters when they are part of the input
        int end = str.length() - (eatsep ? 1 : 0);
        int begin = (eatsep ? 1 : 0);
        read(strb, str, begin, end, sep);
        return strb.toString();
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * The first unescaped separator found is copied into the buffer and stops the reading.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset in origin of the separator that stopped the reading, or the length
     * of the origin if no unescaped separator was found
     */
    public static int readString(StringBuilder strb, CharSequence str, int index, char sep) {
        return read(strb, str, index, str.length(), sep);
    }

    /** The length of an escaped unicode sequence, i.e. its number of hexadecimal digits. */
    private static final int UCHAR_LEN = 4;

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string
     * @return the last character offset handled in origin
     */
    private static int read(StringBuilder strb, CharSequence str, int begin, int end, char sep) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            char c = str.charAt(index);
            if (escape) {
                // a Unicode escape needs its 4 digits to fit before 'end' and to be valid hexadecimal
                if (c == 'u' && (index + UCHAR_LEN) < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    index += UCHAR_LEN;
                } else {
                    // if c is not an escapable character, re-emit the backslash before it
                    boolean notSeparator = sep == 0 ? c != '\'' && c != '"' : c != sep;
                    if (notSeparator && c != '\\') {
                        strb.append('\\');
                    }
                    strb.append(c);
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                // hold the backslash back until the next character tells whether it escapes something
                escape = true;
                continue;
            }
            strb.append(c);
            if (c == sep) {
                // unescaped separator: it has been copied, stop here
                break;
            }
        }
        return index;
    }

    /** Initial shift value for composing a Unicode char from 4 nibbles (16 - 4). */
    private static final int SHIFT = 12;
    /** The base 10 offset used to convert hexa characters to decimal. */
    private static final int BASE10 = 10;

    /**
     * Reads a Unicode escape character.
     * <p>
     * Exactly 4 characters are examined; each must be a hexadecimal digit, i.e. one of
     * '0'-'9', 'a'-'f' or 'A'-'F'. Nothing is written to the builder unless all 4 are valid.
     * </p>
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (after the backslash and the 'u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(StringBuilder strb, CharSequence str, int begin) {
        char xc = 0;
        int bits = SHIFT;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            char c = str.charAt(begin + offset);
            int value;
            if (c >= '0' && c <= '9') {
                value = c - '0';
            } else if (c >= 'a' && c <= 'f') {
                value = c - 'a' + BASE10;
            } else if (c >= 'A' && c <= 'F') {
                value = c - 'A' + BASE10;
            } else {
                // not a hexadecimal digit: the whole sequence is rejected
                return 0;
            }
            // most significant nibble first; each digit contributes 4 bits
            xc |= value << bits;
            bits -= UCHAR_LEN;
        }
        strb.append(xc);
        return UCHAR_LEN;
    }

    /** The last 7bits ascii character. */
    private static final char LAST_ASCII = 127;

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * The result is delimited by single quotes; single quotes and backslashes found in the
     * input are prefixed by a backslash. Characters whose code is 127 or above are written
     * as a backslash, a 'u' and 4 hexadecimal digits.
     * </p>
     * @param str the string to escape
     * @return the escaped representation, or null if {@code str} is null
     */
    public static String escapeString(String str) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        StringBuilder strb = new StringBuilder(length + 2);
        strb.append('\'');
        for (int i = 0; i < length; ++i) {
            char c = str.charAt(i);
            if (c < LAST_ASCII) {
                if (c == '\'') {
                    // escape quote
                    strb.append('\\');
                    strb.append('\'');
                } else if (c == '\\') {
                    // escape backslash
                    strb.append('\\');
                    strb.append('\\');
                } else {
                    strb.append(c);
                }
            } else {
                // convert to Unicode escape sequence, left-padded with zeros to 4 digits
                strb.append('\\');
                strb.append('u');
                String hex = Integer.toHexString(c);
                for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                    strb.append('0');
                }
                strb.append(hex);
            }
        }
        strb.append('\'');
        return strb.toString();
    }
}