001    /*
002     * LAPIS lightweight structured text processing system
003     *
004     * Copyright (C) 1998-2002 Carnegie Mellon University,
005     * Copyright (C) 2003 Massachusetts Institute of Technology.
006     * All rights reserved.
007     *
008     * This library is free software; you can redistribute it
009     * and/or modify it under the terms of the GNU General
010     * Public License as published by the Free Software
011     * Foundation, version 2.
012     *
013     * LAPIS homepage: http://graphics.lcs.mit.edu/lapis/
014     */
015    
016    
017    package lapisx.util;
018    
019    import java.util.StringTokenizer;
020    
/**
 * String utility routines.
 * <P>
 * All methods are static and none of them accept <code>null</code>
 * arguments unless stated otherwise.
 */
public abstract class Str {

    // Scanner states used by parseNumber: which part of a number the
    // scan is currently in.
    private static final int INT = 0;
    private static final int FRAC = 1;
    private static final int EXP = 2;

    /**
     * Find first occurrence of any of a set of characters.
     * @param subject String in which to search
     * @param chars Characters to search for
     * @return index of first occurrence in subject of a character from chars,
     * or -1 if no match.
     */
    public static int indexOfAnyChar (String subject, String chars) {
        return indexOfAnyChar (subject, chars, 0);
    }

    /**
     * Find first occurrence of any of a set of characters, starting
     * at a specified index.
     * @param subject String in which to search
     * @param chars Characters to search for
     * @param start Starting offset to search from.  A negative value is
     * treated as 0, as String.indexOf does.
     * @return index of first occurrence (at or after start) in subject of
     * a character from chars, or -1 if no match.
     */
    public static int indexOfAnyChar (String subject, String chars, int start) {
        for (int i = Math.max (start, 0), n = subject.length (); i < n; ++i)
            if (chars.indexOf (subject.charAt (i)) != -1)
                return i;
        return -1;
    }

    /**
     * Replace all occurrences of a string.
     * @param subject String in which to search
     * @param original String to search for in subject.  If empty, nothing
     * is replaced and subject is returned unchanged.
     * @param replacement String to substitute
     * @return subject with all occurrences of original replaced by replacement
     */
    public static String replace (String subject, String original, String replacement) {
        // An empty pattern matches at every position without consuming
        // anything, so the scan below would never advance.
        if (original.length () == 0)
            return subject;

        StringBuffer output = new StringBuffer (subject.length ());

        int p = 0;      // start of the text not yet copied
        int i;          // start of the current match
        while ((i = subject.indexOf (original, p)) != -1) {
            output.append (subject.substring (p, i));
            output.append (replacement);
            p = i + original.length ();
        }
        output.append (subject.substring (p));
        return output.toString ();
    }

    /**
     * Escapes metacharacters in a string.
     * @param subject String in which metacharacters are to be escaped
     * @param escapeChar the escape character (e.g., \)
     * @param metachars the metacharacters that should be escaped
     * @return subject with escapeChar inserted before every character found in metachars
     */
    public static String escape (String subject, char escapeChar, String metachars) {
        // Each metacharacter is its own escape code.
        return escape (subject, metachars, escapeChar, metachars);
    }

    /**
     * Escapes characters in a string.
     * @param subject String in which metacharacters are to be escaped
     * @param chars Characters that need to be escaped (e.g. "\b\t\r\n\\")
     * @param escapeChar the escape character (e.g., '\\')
     * @param metachars escape code letters corresponding to each letter in chars (e.g. "btrn\\")
     *    <B>Must have metachars.length () == chars.length().</B>
     * @return subject where every occurrence of c in chars is replaced
     * by escapeChar followed by the character corresponding to c in metachars.
     */
    public static String escape (String subject, String chars, char escapeChar, String metachars) {
        StringBuffer output = new StringBuffer (subject.length ());

        int p = 0;      // start of the text not yet copied
        int i;          // position of the next character needing escape
        while ((i = indexOfAnyChar (subject, chars, p)) != -1) {
            output.append (subject.substring (p, i));

            // look up the escape code at the same position as the character
            int k = chars.indexOf (subject.charAt (i));
            output.append (escapeChar);
            output.append (metachars.charAt (k));

            p = i + 1;
        }
        output.append (subject.substring (p));
        return output.toString ();
    }

    /**
     * Translate escape sequences (e.g. \r, \n) to characters.
     * @param subject String in which escape sequences are to be translated
     * @param escapeChar the escape character (e.g., \)
     * @param metachars letters representing escape codes (typically "btrn\\")
     * @param chars characters corresponding to metachars (typically "\b\t\r\n\\").
     *    <B>Must have chars.length () == metachars.length().</B>
     * @param keepUntranslatedEscapes Controls behavior on unknown escape sequences
     * (see below).
     * @return subject where every escapeChar followed by c in metachars
     * is replaced by the character corresponding to c in chars.  If an escape
     * sequence is untranslatable (because escapeChar is followed by some character c
     * not in metachars), then the escapeChar is kept if keepUntranslatedEscapes is true,
     * otherwise the escapeChar is deleted. (The character c is always kept.)
     * An escapeChar that is the very last character of subject is always kept.
     */
    public static String unescape (String subject, char escapeChar, String metachars, String chars, boolean keepUntranslatedEscapes) {
        int len = subject.length ();
        StringBuffer output = new StringBuffer (len);

        int p = 0;      // start of the text not yet copied
        int i;          // position of the next escapeChar
        while ((i = subject.indexOf (escapeChar, p)) != -1) {
            output.append (subject.substring (p, i));
            if (i + 1 == len) {
                // Dangling escapeChar at the end of the string.  Everything
                // before it has just been copied, so resume from the
                // escapeChar itself; otherwise that text is copied twice.
                p = i;
                break;
            }

            char metac = subject.charAt (i+1);  // escape code to translate
            int k = metachars.indexOf (metac);
            if (k == -1) {
                // untranslatable sequence
                if (keepUntranslatedEscapes)
                    output.append (escapeChar);
                output.append (metac);
            }
            else
                output.append (chars.charAt (k));   // its corresponding true char

            p = i + 2;    // skip over both escapeChar & metac
        }

        output.append (subject.substring (p));
        return output.toString ();
    }

    /**
     * Parse a number from a string. Finds the first recognizable base-10 number (integer or floating point)
     * in the string and returns it as a Number.  Uses American English conventions
     * (i.e., '.' as decimal point and ',' as thousands separator).
     * <P>
     * The number starts at the first digit in s, extended backwards over an
     * immediately preceding '.' and/or '-'.  It ends at the last digit that
     * can still belong to it, so trailing punctuation or letters
     * (as in "5." or "12eggs") are not part of the number.
     * @param s String to parse
     * @return first recognizable number: an Integer if it has no fraction
     * or exponent (a Long, or failing that a Double, if it is too large for
     * an Integer), otherwise a Float
     * @exception NumberFormatException if no recognizable number is found
     */
    public static Number parseNumber (String s) throws NumberFormatException {
        int len = s.length ();
        for (int i = 0; i < len; ++i) {
            if (!Character.isDigit (s.charAt (i)))
                continue;

            int start = i;      // first character of the number
            int end = ++i;      // just past the last digit accepted so far
            int state = INT;

            if (start > 0 && s.charAt (start-1) == '.') {
                --start;
                state = FRAC;
            }
            if (start > 0 && s.charAt (start-1) == '-')
                --start;

            // State at the time the digit before end was accepted.  This,
            // not the final scanner state, decides the result type: a '.'
            // or 'e' that is not followed by digits is not part of the number.
            int endState = state;

          foundEnd:
            while (i < len) {
                switch (s.charAt (i)) {
                  case '0': case '1': case '2': case '3': case '4':
                  case '5': case '6': case '7': case '8': case '9':
                    end = ++i;
                    endState = state;
                    break;
                  case '.':
                    if (state != INT)
                        break foundEnd;
                    state = FRAC;
                    ++i;
                    break;
                  case ',':
                    // thousands separators only make sense in the integer part
                    if (state != INT)
                        break foundEnd;
                    ++i;
                    break;
                  case 'e':
                  case 'E':
                    // a number has at most one exponent
                    if (state == EXP)
                        break foundEnd;
                    state = EXP;
                    ++i;
                    if (i < len) {
                        char sign = s.charAt (i);
                        if (sign == '+' || sign == '-')
                            ++i;
                    }
                    break;
                  default:
                    break foundEnd;
                }
            }

            String num = replace (s.substring (start, end), ",", "");
            if (endState != INT)
                return Float.valueOf (num);

            try {
                return Integer.valueOf (num);
            } catch (NumberFormatException tooLargeForInteger) {
                try {
                    return Long.valueOf (num);
                } catch (NumberFormatException tooLargeForLong) {
                    return Double.valueOf (num);
                }
            }
        }
        throw new NumberFormatException (s);
    }

    /**
     * Generate a string by concatenating n copies of another string.
     * @param s String to repeat
     * @param n number of times to repeat s
     * @return s concatenated with itself n times (the empty string if n <= 0)
     */
    public static String repeat (String s, int n) {
        StringBuffer out = new StringBuffer ();
        for (int k = 0; k < n; ++k)
            out.append (s);
        return out.toString ();
    }

    /**
     * Compress whitespace.
     * @param s String to compress
     * @return string with leading and trailing whitespace removed, and
     * internal runs of whitespace replaced by a single space character
     */
    public static String compressWhitespace (String s) {
        StringBuffer output = new StringBuffer (s.length ());
        // True when whitespace has been seen since the last word was copied.
        // The separating space is only emitted once another word actually
        // follows, so trailing whitespace never reaches the output.
        boolean pendingSpace = false;
        for (int i = 0, len = s.length (); i < len; ++i) {
            char c = s.charAt (i);
            if (Character.isWhitespace (c)) {
                // whitespace before the first word is simply dropped
                pendingSpace = output.length () > 0;
            }
            else {
                if (pendingSpace) {
                    output.append (' ');
                    pendingSpace = false;
                }
                output.append (c);
            }
        }
        return output.toString ();
    }

    /**
     * Test if string contains only whitespace.
     * @param s String to test
     * @return true iff all characters in s satisfy Character.isWhitespace().
     * If s is empty, returns true.
     */
    public static boolean isWhitespace (String s) {
        for (int i = 0, n = s.length (); i < n; ++i)
            if (!Character.isWhitespace (s.charAt (i)))
                return false;
        return true;
    }

    /**
     * Concatenate an array of strings.
     * @param list Array of strings to concatenate
     * @param sep Separator to insert between each string
     * @return string consisting of list[0] + sep + list[1] + sep + ... + sep + list[list.length-1]
     * (the empty string if list is empty)
     */
    public static String join (String[] list, String sep) {
        StringBuffer result = new StringBuffer ();
        for (int i=0; i < list.length; ++i) {
            if (i > 0)
                result.append (sep);
            result.append (list[i]);
        }
        return result.toString ();
    }

    /**
     * Abbreviate a string.
     * @param s String to abbreviate
     * @param max Maximum length of returned string; must be at least 5
     * @return s with whitespace compressed (see compressWhitespace) and
     * enough characters removed from the middle (replaced by "...") to
     * make length <= max.  If max is less than 5 the result may be longer
     * than max.
     */
    public static String abbreviate (String s, int max) {
        s = compressWhitespace (s);
        if (s.length () <= max)
            return s;

        int half = Math.max (max-3, 2) / 2;   // 3 characters go to "..."
        // Only reachable when max < 5: the abbreviated form would be no
        // shorter than s (or s is too short to take two halves from).
        if (s.length () <= 2*half + 3)
            return s;

        return s.substring (0, half) + "..." + s.substring (s.length ()-half);
    }

    /**
     * Abbreviate a multi-line string.
     * @param s String to abbreviate
     * @param maxLines Max number of lines in returned string; must be at least 3
     * @param message Message to replace removed lines with; should end with
     * \n, but may be multiple lines.  Occurrences of %d are replaced with
     * the number of lines removed.
     * @return s with enough whole lines removed from
     * the middle (replaced by message) to make its length in lines <= maxLines
     * (assuming message is a single line)
     */
    public static String abbreviateLines (String s, int maxLines, String message) {
        int nLines = countLines (s);
        if (nLines <= maxLines)
            return s;

        int half = Math.max (maxLines-1, 2) / 2;   // one line goes to message
        // Only reachable when maxLines < 3: there is nothing to remove
        // between the first and last half lines.
        if (nLines <= 2*half)
            return s;

        return s.substring (0, nthLine (s, half))
            + replace (message, "%d", String.valueOf (nLines - half*2))
            + s.substring (nthLine (s, -half));
    }

    /**
     * Count lines in a string.
     * @param s String to examine
     * @return one more than the number of '\n' characters in s
     */
    static int countLines (String s) {
        int n = 1;
        int i = -1;
        while ((i = s.indexOf ('\n', i+1)) != -1)
            ++n;
        return n;
    }

    /**
     * Find the offset at which a line starts.
     * @param s String to examine
     * @param n Line number.  If n >= 0, lines are counted from the start of s,
     * with line 0 starting at offset 0.  If n < 0, lines are counted back from
     * the end of s, with -1 being the last line.
     * @return offset in s just after the '\n' that precedes the line, or 0
     * if s does not contain enough '\n' characters
     */
    static int nthLine (String s, int n) {
        if (n >= 0) {
            int i = -1;
            while (n > 0 && (i = s.indexOf ('\n', i+1)) != -1)
                --n;
            return i+1;
        } else {
            int i = s.length ();
            while (n < 0 && (i = s.lastIndexOf ('\n', i-1)) != -1)
                ++n;
            return i+1;
        }
    }

    /**
      * Split string around a substring match and return prefix.
      * @param s String to split
      * @param pat Substring to search for in s
      * @return Prefix of s ending just before the first occurrence
      * of pat.  If pat is not found in s, returns s itself.
      */
    public static String before (String s, String pat) {
        int i = s.indexOf (pat);
        return (i >= 0) ? s.substring (0, i) : s;
    }

    /**
      * Split string around a substring match and return suffix.
      * @param s String to split
      * @param pat Substring to search for in s
      * @return Suffix of s starting just after the first occurrence
      * of pat.  If pat is not found in s, returns "".
      */
    public static String after (String s, String pat) {
        int i = s.indexOf (pat);
        return (i >= 0) ? s.substring (i + pat.length ()) : "";
    }


    /**
      * Like String.startsWith, but case-insensitive.
      * @param s String to test
      * @param prefix Prefix to look for
      * @return true iff s is at least as long as prefix and its first
      * prefix.length() characters equal prefix, ignoring case
      */
    public static boolean startsWithIgnoreCase (String s, String prefix) {
        // regionMatches returns false by itself when prefix is longer than s
        return s.regionMatches (true, 0, prefix, 0, prefix.length ());
    }

    /**
      * Like String.endsWith, but case-insensitive.
      * @param s String to test
      * @param suffix Suffix to look for
      * @return true iff s is at least as long as suffix and its last
      * suffix.length() characters equal suffix, ignoring case
      */
    public static boolean endsWithIgnoreCase (String s, String suffix) {
        int suffixLen = suffix.length ();
        // a negative offset (suffix longer than s) makes regionMatches return false
        return s.regionMatches (true, s.length () - suffixLen, suffix, 0, suffixLen);
    }

    /**
      * Expands tabs to spaces.
      * @param s String in which to expand tabs
      * @param tabsize Distance between tab stops, in columns; must be positive
      * @return s with every tab replaced by enough spaces to reach the next
      * multiple of tabsize.  Columns are counted in characters and restart
      * at 0 after every '\r' or '\n'.
      */
    public static String untabify (String s, int tabsize) {
        if (s.indexOf ('\t') == -1)
            return s; // no tabs, don't bother

        int col = 0;    // current output column on this line
        StringBuffer result = new StringBuffer ();
        // The delimiters are returned as one-character tokens of their own,
        // so the first character of a token identifies what it is.
        StringTokenizer tokenizer = new StringTokenizer (s, "\t\r\n", true);
        while (tokenizer.hasMoreTokens ()) {
            String tok = tokenizer.nextToken ();
            switch (tok.charAt (0)) {
            case '\t':
                {
                    int nextStop = (col/tabsize + 1) * tabsize;
                    result.append (Str.repeat (" ", nextStop - col));
                    col = nextStop;
                }
                break;
            case '\r':
            case '\n':
                col = 0;
                result.append (tok);
                break;
            default:
                col += tok.length ();
                result.append (tok);
                break;
            }
        }

        return result.toString ();
    }

    /**
     * Reverse a string.
     * @param s String to reverse
     * @return string containing characters of s in reverse order
     */
    public static String reverse (String s) {
        StringBuffer t = new StringBuffer (s.length ());
        for (int i = s.length () - 1; i >= 0; --i)
            t.append (s.charAt (i));
        return t.toString ();
    }

    /**
     * Find longest common prefix of two strings.
     * @param s First string
     * @param t Second string
     * @return longest string that both s and t start with
     */
    public static String longestCommonPrefix (String s, String t) {
        return s.substring (0, longestCommonPrefixLength (s, t));
    }

    /**
     * Find length of longest common prefix of two strings.
     * @param s First string
     * @param t Second string
     * @return number of leading characters that s and t have in common
     */
    public static int longestCommonPrefixLength (String s, String t) {
        int m = Math.min (s.length (), t.length ());
        for (int k = 0; k < m; ++k)
            if (s.charAt (k) != t.charAt (k))
                return k;
        return m;
    }

    /**
     * Find longest common suffix of two strings.
     * @param s First string
     * @param t Second string
     * @return longest string that both s and t end with
     */
    public static String longestCommonSuffix (String s, String t) {
        return s.substring (s.length () - longestCommonSuffixLength (s, t));
    }

    /**
     * Find length of longest common suffix of two strings.
     * @param s First string
     * @param t Second string
     * @return number of trailing characters that s and t have in common
     */
    public static int longestCommonSuffixLength (String s, String t) {
        int i = s.length ();
        int j = t.length ();
        int n = 0;
        // walk both strings backwards in step until they differ or one runs out
        while (i > 0 && j > 0 && s.charAt (--i) == t.charAt (--j))
            ++n;
        return n;
    }

    /**
     * Find longest common prefix of two strings, ignoring case.
     * @param s First string
     * @param t Second string
     * @return longest prefix of s that t also starts with, where characters
     * are compared after Character.toLowerCase
     */
    public static String longestCommonPrefixIgnoreCase (String s, String t) {
        return s.substring (0, longestCommonPrefixLengthIgnoreCase (s, t));
    }

    /**
     * Find length of longest common prefix of two strings, ignoring case.
     * @param s First string
     * @param t Second string
     * @return number of leading characters that s and t have in common,
     * where characters are compared after Character.toLowerCase
     */
    public static int longestCommonPrefixLengthIgnoreCase (String s, String t) {
        int m = Math.min (s.length (), t.length ());
        for (int k = 0; k < m; ++k)
            if (Character.toLowerCase (s.charAt (k)) != Character.toLowerCase (t.charAt (k)))
                return k;
        return m;
    }

    /**
     * Find longest common suffix of two strings, ignoring case.
     * @param s First string
     * @param t Second string
     * @return longest suffix of s that t also ends with, where characters
     * are compared after Character.toLowerCase
     */
    public static String longestCommonSuffixIgnoreCase (String s, String t) {
        return s.substring (s.length () - longestCommonSuffixLengthIgnoreCase (s, t));
    }

    /**
     * Find length of longest common suffix of two strings, ignoring case.
     * @param s First string
     * @param t Second string
     * @return number of trailing characters that s and t have in common,
     * where characters are compared after Character.toLowerCase
     */
    public static int longestCommonSuffixLengthIgnoreCase (String s, String t) {
        int i = s.length ();
        int j = t.length ();
        int n = 0;
        // walk both strings backwards in step until they differ or one runs out
        while (i > 0 && j > 0
               && Character.toLowerCase (s.charAt (--i)) == Character.toLowerCase (t.charAt (--j)))
            ++n;
        return n;
    }
}