001 /*
002 * LAPIS lightweight structured text processing system
003 *
004 * Copyright (C) 1998-2002 Carnegie Mellon University,
005 * Copyright (C) 2003 Massachusetts Institute of Technology.
006 * All rights reserved.
007 *
008 * This library is free software; you can redistribute it
009 * and/or modify it under the terms of the GNU General
010 * Public License as published by the Free Software
011 * Foundation, version 2.
012 *
013 * LAPIS homepage: http://graphics.lcs.mit.edu/lapis/
014 */
015
016
017 package lapisx.util;
018
019 import java.util.StringTokenizer;
020
021 /**
022 * String utility routines.
023 */
public abstract class Str {

    /**
     * Find first occurrence of any of a set of characters.
     * @param subject String in which to search
     * @param chars Characters to search for
     * @return index of first occurrence in subject of a character from chars,
     * or -1 if no match.
     */
    public static int indexOfAnyChar (String subject, String chars) {
        return indexOfAnyChar (subject, chars, 0);
    }

    /**
     * Find first occurrence of any of a set of characters, starting
     * at a specified index.
     * @param subject String in which to search
     * @param chars Characters to search for
     * @param start Starting offset to search from; a negative value is
     * treated as 0 (as String.indexOf does)
     * @return index of first occurrence (at or after start) in subject of a
     * character from chars, or -1 if no match.
     */
    public static int indexOfAnyChar (String subject, String chars, int start) {
        for (int i = Math.max (start, 0); i < subject.length (); ++i) {
            if (chars.indexOf (subject.charAt (i)) != -1) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Replace all occurrences of a string.
     * @param subject String in which to search
     * @param original String to search for in subject
     * @param replacement String to substitute
     * @return subject with all occurrences of original replaced by replacement.
     * If original is empty, subject is returned unchanged.
     */
    public static String replace (String subject, String original, String replacement) {
        if (original.length () == 0) {
            // indexOf ("") matches at every position without advancing,
            // so the search loop below would never terminate.
            return subject;
        }

        StringBuffer output = new StringBuffer ();
        int p = 0;      // start of the not-yet-copied part of subject
        int i;          // index of the next match
        while ((i = subject.indexOf (original, p)) != -1) {
            output.append (subject.substring (p, i));
            output.append (replacement);
            p = i + original.length ();
        }
        output.append (subject.substring (p));
        return output.toString ();
    }

    /**
     * Escapes metacharacters in a string.
     * @param subject String in which metacharacters are to be escaped
     * @param escapeChar the escape character (e.g., \)
     * @param metachars the metacharacters that should be escaped
     * @return subject with escapeChar inserted before every character found in metachars
     */
    public static String escape (String subject, char escapeChar, String metachars) {
        return escape (subject, metachars, escapeChar, metachars);
    }

    /**
     * Escapes characters in a string.
     * @param subject String in which metacharacters are to be escaped
     * @param chars Characters that need to be escaped (e.g. "\b\t\r\n\\")
     * @param escapeChar the escape character (e.g., '\\')
     * @param metachars escape code letters corresponding to each letter in chars (e.g. "btrn\\")
     * <B>Must have metachars.length () == chars.length().</B>
     * @return subject where every occurrence of c in chars is replaced
     * by escapeChar followed by the character corresponding to c in metachars.
     */
    public static String escape (String subject, String chars, char escapeChar, String metachars) {
        StringBuffer output = new StringBuffer ();

        int p = 0;      // start of the not-yet-copied part of subject
        int i;          // index of the next character needing an escape
        while ((i = indexOfAnyChar (subject, chars, p)) != -1) {
            output.append (subject.substring (p, i));

            char c = subject.charAt (i);            // character that needs escaping
            int k = chars.indexOf (c);
            char metac = metachars.charAt (k);      // its corresponding metachar
            output.append (escapeChar);
            output.append (metac);

            p = i + 1;
        }
        if (p < subject.length ()) {
            output.append (subject.substring (p));
        }
        return output.toString ();
    }

    /**
     * Translate escape sequences (e.g. \r, \n) to characters.
     * @param subject String in which escape sequences are to be translated
     * @param escapeChar the escape character (e.g., \)
     * @param metachars letters representing escape codes (typically "btrn\\")
     * @param chars characters corresponding to metachars (typically "\b\t\r\n\\").
     * <B>Must have chars.length () == metachars.length().</B>
     * @param keepUntranslatedEscapes Controls behavior on unknown escape sequences
     * (see below).
     * @return subject where every escapeChar followed by c in metachars
     * is replaced by the character corresponding to c in chars.  If an escape
     * sequence is untranslatable (because escapeChar is followed by some character c
     * not in metachars), then the escapeChar is kept if keepUntranslatedEscapes is true,
     * otherwise the escapeChar is deleted. (The character c is always kept.)
     * An escapeChar that is the very last character of subject has nothing
     * to translate and is copied through unchanged.
     */
    public static String unescape (String subject, char escapeChar, String metachars, String chars, boolean keepUntranslatedEscapes) {
        StringBuffer output = new StringBuffer ();

        int p = 0;      // start of the not-yet-copied part of subject
        int i;          // index of the next escapeChar
        int len = subject.length ();
        while ((i = subject.indexOf (escapeChar, p)) != -1) {
            output.append (subject.substring (p, i));
            if (i + 1 == len) {
                // Lone escapeChar at the end of the string.  Advance p so the
                // tail copy below emits just that character; leaving p alone
                // would append the already-copied text a second time.
                p = i;
                break;
            }

            char metac = subject.charAt (i + 1);    // metachar to replace
            int k = metachars.indexOf (metac);
            if (k == -1) {
                // untranslatable sequence
                if (keepUntranslatedEscapes) {
                    output.append (escapeChar);
                }
                output.append (metac);
            } else {
                output.append (chars.charAt (k));   // its corresponding true char
            }

            p = i + 2;  // skip over both escapeChar & metac
        }

        if (p < len) {
            output.append (subject.substring (p));
        }
        return output.toString ();
    }

    // Scanner states for parseNumber: which part of the number is being read.
    private static final int INT = 0;
    private static final int FRAC = 1;
    private static final int EXP = 2;

    /**
     * Parse a number from a string.  Finds the first recognizable base-10 number (integer or floating point)
     * in the string and returns it as a Number.  Uses American English conventions
     * (i.e., '.' as decimal point and ',' as thousands separator).
     * <p>
     * A number with no decimal point or exponent marker is returned as an
     * Integer, or as a Long or java.math.BigInteger if it does not fit in
     * an int.  Anything else is returned as a Float.  Note that digits
     * directly followed by '.' or 'e'/'E' are reported as a Float even when
     * no fraction or exponent digits follow (e.g. "3." yields 3.0).
     * @param s String to parse
     * @return first recognizable number
     * @exception NumberFormatException if no recognizable number is found
     */
    public static Number parseNumber (String s) throws NumberFormatException {
        for (int i = 0; i < s.length (); ++i) {
            if (!Character.isDigit (s.charAt (i))) {
                continue;
            }

            // Found the first digit; the number is s[start, end).
            int start = i;
            int end = ++i;
            int state = INT;

            // Pull in a decimal point and/or minus sign just before the digit.
            if (start > 0 && s.charAt (start - 1) == '.') {
                --start;
                state = FRAC;
            }
            if (start > 0 && s.charAt (start - 1) == '-') {
                --start;
            }

            // Scan forward; end only advances past digits, so trailing
            // punctuation (".", ",", "e", sign) is never part of the number.
            foundEnd:
            while (i < s.length ()) {
                switch (s.charAt (i)) {
                    case '0': case '1': case '2': case '3': case '4':
                    case '5': case '6': case '7': case '8': case '9':
                        end = ++i;
                        break;
                    case '.':
                        if (state != INT) {
                            break foundEnd;
                        }
                        state = FRAC;
                        ++i;
                        break;
                    case ',':   // ignore commas
                        ++i;
                        break;
                    case 'e':
                    case 'E':
                        if (state == EXP) {
                            // a second exponent marker cannot belong to this number
                            break foundEnd;
                        }
                        state = EXP;
                        ++i;
                        if (i < s.length ()) {
                            char sign = s.charAt (i);
                            if (sign == '+' || sign == '-') {
                                ++i;
                            }
                        }
                        break;
                    default:
                        break foundEnd;
                }
            }

            String num = replace (s.substring (start, end), ",", "");
            try {
                if (state == INT) {
                    return parseInteger (num);
                } else {
                    return Float.valueOf (num);
                }
            } catch (NumberFormatException e) {
                throw new RuntimeException ("internal error: " + e, e);
            }
        }
        throw new NumberFormatException (s);
    }

    /**
     * Converts an optionally signed digit string to the narrowest of
     * Integer, Long, or BigInteger that can hold it.
     */
    private static Number parseInteger (String digits) {
        try {
            return Integer.valueOf (digits);
        } catch (NumberFormatException tooBigForInt) {
            try {
                return Long.valueOf (digits);
            } catch (NumberFormatException tooBigForLong) {
                return new java.math.BigInteger (digits);
            }
        }
    }

    /*
      For testing parseNumber

    public static void main (String[] args) {
      for (int i=0; i<args.length; ++i)
          System.out.println (parseNumber (args[i]));
    }
    */


    /**
     * Generate a string by concatenating n copies of another string.
     * @param s String to repeat
     * @param n number of times to repeat s
     * @return s concatenated with itself n times (empty if n <= 0)
     */
    public static String repeat (String s, int n) {
        StringBuffer out = new StringBuffer ();
        while (--n >= 0) {
            out.append (s);
        }
        return out.toString ();
    }

    /**
     * Compress whitespace.
     * @param s String to compress
     * @return string with leading and trailing whitespace removed, and
     * internal runs of whitespace replaced by a single space character
     */
    public static String compressWhitespace (String s) {
        StringBuffer output = new StringBuffer ();
        int p = 0;                  // start of the current word
        boolean inSpace = true;     // true while scanning a whitespace run
        for (int i = 0, len = s.length (); i < len; ++i) {
            if (Character.isWhitespace (s.charAt (i))) {
                if (!inSpace) {
                    // end of a word: copy it out
                    output.append (s.substring (p, i));
                    inSpace = true;
                }
            } else if (inSpace) {
                // Start of a word.  The separator is emitted here rather
                // than at the end of the previous word, so that trailing
                // whitespace never leaves a dangling space.
                if (output.length () > 0) {
                    output.append (' ');
                }
                p = i;
                inSpace = false;
            }
        }
        if (!inSpace) {
            output.append (s.substring (p));
        }
        return output.toString ();
    }

    /**
     * Test if string contains only whitespace.
     * @param s String to test
     * @return true iff all characters in s satisfy Character.isWhitespace().
     * If s is empty, returns true.
     */
    public static boolean isWhitespace (String s) {
        for (int i = 0, n = s.length (); i < n; ++i) {
            if (!Character.isWhitespace (s.charAt (i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Concatenate an array of strings.
     * @param list Array of strings to concatenate
     * @param sep Separator to insert between each string
     * @return string consisting of list[0] + sep + list[1] + sep + ... + sep + list[list.length-1]
     */
    public static String join (String[] list, String sep) {
        StringBuffer result = new StringBuffer ();
        for (int i = 0; i < list.length; ++i) {
            if (i > 0) {
                result.append (sep);
            }
            result.append (list[i]);
        }
        return result.toString ();
    }

    /**
     * Abbreviate a string.
     * @param s String to abbreviate
     * @param max Maximum length of returned string; must be at least 5
     * @return s with whitespace compressed (see compressWhitespace) and,
     * if it is still longer than max, enough characters removed from
     * the middle (replaced by "...") to make length <= max
     */
    public static String abbreviate (String s, int max) {
        s = compressWhitespace (s);
        if (s.length () <= max) {
            return s;
        }

        int keep = Math.max (max - 3, 2);   // room left after "..."
        int half = keep / 2;
        return s.substring (0, half) + "..." + s.substring (s.length () - half);
    }

    /**
     * Abbreviate a multi-line string.
     * @param s String to abbreviate
     * @param maxLines Max number of lines in returned string; must be at least 3
     * @param message Message to replace removed lines with; should end with
     * \n, but may be multiple lines.  Occurrences of %d are replaced with
     * the number of lines removed.
     * @return s itself if it has at most maxLines lines; otherwise s with
     * enough whole lines removed from the middle (replaced by message) to
     * make its length in lines <= maxLines
     */
    public static String abbreviateLines (String s, int maxLines, String message) {
        int nLines = countLines (s);
        if (nLines <= maxLines) {
            return s;
        }

        int keep = Math.max (maxLines - 1, 2);  // take out one line for the message
        int half = keep / 2;
        return s.substring (0, nthLine (s, half))
            + replace (message, "%d", String.valueOf (nLines - half * 2))
            + s.substring (nthLine (s, -half));
    }

    /**
     * Count lines in a string.
     * @param s String to examine
     * @return one more than the number of '\n' characters in s
     */
    static int countLines (String s) {
        int n = 1;
        int i = -1;
        while ((i = s.indexOf ('\n', i + 1)) != -1) {
            ++n;
        }
        return n;
    }

    /**
     * Find the offset of a line boundary.
     * @param s String to examine
     * @param n Number of newlines to skip: counted from the start of s if
     * n >= 0, or backwards from the end of s if n < 0
     * @return index just after the n'th '\n' from the chosen end, or 0 if
     * s contains fewer than |n| newlines
     */
    static int nthLine (String s, int n) {
        if (n >= 0) {
            int i = -1;
            while (n > 0 && (i = s.indexOf ('\n', i + 1)) != -1) {
                --n;
            }
            return i + 1;
        } else {
            int i = s.length ();
            while (n < 0 && (i = s.lastIndexOf ('\n', i - 1)) != -1) {
                ++n;
            }
            return i + 1;
        }
    }

    /**
     * Split string around a substring match and return prefix.
     * @param s String to split
     * @param pat Substring to search for in s
     * @return Prefix of s ending just before the first occurrence
     * of pat.  If pat is not found in s, returns s itself.
     */
    public static String before (String s, String pat) {
        int i = s.indexOf (pat);
        return (i >= 0) ? s.substring (0, i) : s;
    }

    /**
     * Split string around a substring match and return suffix.
     * @param s String to split
     * @param pat Substring to search for in s
     * @return Suffix of s starting just after the first occurrence
     * of pat.  If pat is not found in s, returns "".
     */
    public static String after (String s, String pat) {
        int i = s.indexOf (pat);
        return (i >= 0) ? s.substring (i + pat.length ()) : "";
    }


    /**
     * Like String.startsWith, but case-insensitive.
     * @param s String to test
     * @param prefix Prefix to look for
     * @return true iff s begins with prefix, ignoring case
     */
    public static boolean startsWithIgnoreCase (String s, String prefix) {
        // regionMatches returns false by itself when prefix is longer than s
        return s.regionMatches (true, 0, prefix, 0, prefix.length ());
    }

    /**
     * Like String.endsWith, but case-insensitive.
     * @param s String to test
     * @param suffix Suffix to look for
     * @return true iff s ends with suffix, ignoring case
     */
    public static boolean endsWithIgnoreCase (String s, String suffix) {
        int suffixLen = suffix.length ();
        // a negative offset (suffix longer than s) makes regionMatches return false
        return s.regionMatches (true, s.length () - suffixLen, suffix, 0, suffixLen);
    }

    /**
     * Expands tabs to spaces.
     * @param s String in which to expand tabs
     * @param tabsize Distance between tab stops, in columns
     * @return s with every tab replaced by enough spaces to reach the next
     * multiple of tabsize; the column count restarts after each \r or \n.
     * If s contains no tabs, s itself is returned.
     */
    public static String untabify (String s, int tabsize) {
        if (s.indexOf ('\t') == -1) {
            return s;   // no tabs, don't bother
        }

        int col = 0;
        StringBuffer result = new StringBuffer ();
        // returnDelims=true: each tab, CR and LF arrives as its own one-character token
        StringTokenizer tokenizer = new StringTokenizer (s, "\t\r\n", true);
        while (tokenizer.hasMoreTokens ()) {
            String tok = tokenizer.nextToken ();
            switch (tok.charAt (0)) {
                case '\t': {
                    int oldcol = col;
                    col = (col / tabsize + 1) * tabsize;    // advance to next tab stop
                    result.append (Str.repeat (" ", col - oldcol));
                    break;
                }
                case '\r':
                case '\n':
                    col = 0;
                    result.append (tok);
                    break;
                default:
                    col += tok.length ();
                    result.append (tok);
                    break;
            }
        }

        return result.toString ();
    }

    /**
     * Reverse a string.
     * @param s String to reverse
     * @return string containing characters of s in reverse order
     */
    public static String reverse (String s) {
        StringBuffer t = new StringBuffer (s.length ());
        for (int i = s.length () - 1; i >= 0; --i) {
            t.append (s.charAt (i));
        }
        return t.toString ();
    }

    /**
     * Find longest common prefix of two strings.
     * @param s First string
     * @param t Second string
     * @return longest string that both s and t start with
     */
    public static String longestCommonPrefix (String s, String t) {
        return s.substring (0, longestCommonPrefixLength (s, t));
    }

    /**
     * Find length of the longest common prefix of two strings.
     * @param s First string
     * @param t Second string
     * @return number of leading characters on which s and t agree
     */
    public static int longestCommonPrefixLength (String s, String t) {
        int m = Math.min (s.length (), t.length ());
        for (int k = 0; k < m; ++k) {
            if (s.charAt (k) != t.charAt (k)) {
                return k;
            }
        }
        return m;
    }

    /**
     * Find longest common suffix of two strings.
     * @param s First string
     * @param t Second string
     * @return longest string that both s and t end with
     */
    public static String longestCommonSuffix (String s, String t) {
        return s.substring (s.length () - longestCommonSuffixLength (s, t));
    }

    /**
     * Find length of the longest common suffix of two strings.
     * @param s First string
     * @param t Second string
     * @return number of trailing characters on which s and t agree
     */
    public static int longestCommonSuffixLength (String s, String t) {
        int i = s.length () - 1;
        int j = t.length () - 1;
        // walk both strings backwards until they differ or one runs out
        while (i >= 0 && j >= 0 && s.charAt (i) == t.charAt (j)) {
            --i;
            --j;
        }
        return s.length () - (i + 1);
    }


    /**
     * Find longest common prefix of two strings, ignoring case.
     * @param s First string
     * @param t Second string
     * @return longest prefix of s that matches the start of t, ignoring case
     */
    public static String longestCommonPrefixIgnoreCase (String s, String t) {
        return s.substring (0, longestCommonPrefixLengthIgnoreCase (s, t));
    }

    /**
     * Find length of the longest common prefix of two strings, ignoring case.
     * Characters are compared after Character.toLowerCase.
     * @param s First string
     * @param t Second string
     * @return number of leading characters on which s and t agree, ignoring case
     */
    public static int longestCommonPrefixLengthIgnoreCase (String s, String t) {
        int m = Math.min (s.length (), t.length ());
        for (int k = 0; k < m; ++k) {
            if (Character.toLowerCase (s.charAt (k)) != Character.toLowerCase (t.charAt (k))) {
                return k;
            }
        }
        return m;
    }

    /**
     * Find longest common suffix of two strings, ignoring case.
     * @param s First string
     * @param t Second string
     * @return longest suffix of s that matches the end of t, ignoring case
     */
    public static String longestCommonSuffixIgnoreCase (String s, String t) {
        return s.substring (s.length () - longestCommonSuffixLengthIgnoreCase (s, t));
    }

    /**
     * Find length of the longest common suffix of two strings, ignoring case.
     * Characters are compared after Character.toLowerCase.
     * @param s First string
     * @param t Second string
     * @return number of trailing characters on which s and t agree, ignoring case
     */
    public static int longestCommonSuffixLengthIgnoreCase (String s, String t) {
        int i = s.length () - 1;
        int j = t.length () - 1;
        // walk both strings backwards until they differ or one runs out
        while (i >= 0 && j >= 0
               && Character.toLowerCase (s.charAt (i)) == Character.toLowerCase (t.charAt (j))) {
            --i;
            --j;
        }
        return s.length () - (i + 1);
    }
}