/*
 * LAPIS lightweight structured text processing system
 *
 * Copyright (C) 1998-2002 Carnegie Mellon University,
 * Copyright (C) 2003 Massachusetts Institute of Technology.
 * All rights reserved.
 *
 * This library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU General
 * Public License as published by the Free Software
 * Foundation, version 2.
 *
 * LAPIS homepage: http://graphics.lcs.mit.edu/lapis/
 */


package lapisx.util;

import java.util.StringTokenizer;

/**
 * String utility routines.  All methods are static; none of them accept
 * null arguments.
 */
public abstract class Str {

    /**
     * Find first occurrence of any of a set of characters.
     * @param subject String in which to search
     * @param chars Characters to search for
     * @return index of first occurrence in subject of a character from chars,
     * or -1 if no match.
     */
    public static int indexOfAnyChar (String subject, String chars) {
        return indexOfAnyChar (subject, chars, 0);
    }

    /**
     * Find first occurrence of any of a set of characters, starting
     * at a specified index.
     * @param subject String in which to search
     * @param chars Characters to search for
     * @param start Starting offset to search from; negative values are
     * treated as 0 (as String.indexOf does)
     * @return index of first occurrence (at or after start) in subject of a
     * character from chars, or -1 if no match.
     */
    public static int indexOfAnyChar (String subject, String chars, int start) {
        for (int i = Math.max (start, 0), n = subject.length (); i < n; ++i) {
            if (chars.indexOf (subject.charAt (i)) != -1) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Replace all occurrences of a string.
     * @param subject String in which to search
     * @param original String to search for in subject
     * @param replacement String to substitute
     * @return subject with all occurrences of original replaced by replacement.
     * If original is empty, subject is returned unchanged.
     */
    public static String replace (String subject, String original, String replacement) {
        if (original.length () == 0) {
            // An empty pattern matches at every position without consuming
            // any input, so the scan below would never advance.
            return subject;
        }

        StringBuffer output = new StringBuffer ();
        int p = 0;      // start of the text not yet copied to output
        int i;          // start of the current match
        while ((i = subject.indexOf (original, p)) != -1) {
            output.append (subject.substring (p, i));
            output.append (replacement);
            p = i + original.length ();
        }
        output.append (subject.substring (p));
        return output.toString ();
    }

    /**
     * Escapes metacharacters in a string.
     * @param subject String in which metacharacters are to be escaped
     * @param escapeChar the escape character (e.g., \)
     * @param metachars the metacharacters that should be escaped
     * @return subject with escapeChar inserted before every character found in metachars
     */
    public static String escape (String subject, char escapeChar, String metachars) {
        return escape (subject, metachars, escapeChar, metachars);
    }

    /**
     * Escapes characters in a string.
     * @param subject String in which metacharacters are to be escaped
     * @param chars Characters that need to be escaped (e.g. "\b\t\r\n\\")
     * @param escapeChar the escape character (e.g., '\\')
     * @param metachars escape code letters corresponding to each letter in chars (e.g. "btrn\\")
     *    <B>Must have metachars.length () == chars.length().</B>
     * @return subject where every occurrence of c in chars is replaced
     * by escapeChar followed by the character corresponding to c in metachars.
     */
    public static String escape (String subject, String chars, char escapeChar, String metachars) {
        StringBuffer output = new StringBuffer (subject.length ());
        for (int i = 0, n = subject.length (); i < n; ++i) {
            char c = subject.charAt (i);
            int k = chars.indexOf (c);
            if (k == -1) {
                output.append (c);
            } else {
                output.append (escapeChar);
                output.append (metachars.charAt (k));   // escape code for c
            }
        }
        return output.toString ();
    }

    /**
     * Translate escape sequences (e.g. \r, \n) to characters.
     * @param subject String in which escape sequences are to be translated
     * @param escapeChar the escape character (e.g., \)
     * @param metachars letters representing escape codes (typically "btrn\\")
     * @param chars characters corresponding to metachars (typically "\b\t\r\n\\").
     *    <B>Must have chars.length () == metachars.length().</B>
     * @param keepUntranslatedEscapes Controls behavior on unknown escape sequences
     *    (see below).
     * @return subject where every escapeChar followed by c in metachars
     * is replaced by the character corresponding to c in chars.  If an escape
     * sequence is untranslatable (because escapeChar is followed by some character c
     * not in metachars, or because escapeChar is the last character of subject),
     * then the escapeChar is kept if keepUntranslatedEscapes is true,
     * otherwise the escapeChar is deleted.  (The character c is always kept.)
     */
    public static String unescape (String subject, char escapeChar, String metachars, String chars, boolean keepUntranslatedEscapes) {
        StringBuffer output = new StringBuffer ();

        int p = 0;      // start of the text not yet copied to output
        int i;          // position of the current escapeChar
        int len = subject.length ();
        while ((i = subject.indexOf (escapeChar, p)) != -1) {
            output.append (subject.substring (p, i));

            if (i + 1 == len) {
                // Dangling escapeChar at the very end.  Everything before it
                // has already been copied, so mark the whole subject as
                // consumed; otherwise the tail would be appended twice.
                if (keepUntranslatedEscapes) {
                    output.append (escapeChar);
                }
                p = len;
                break;
            }

            char metac = subject.charAt (i + 1);    // escape code to translate
            int k = metachars.indexOf (metac);
            if (k == -1) {
                // untranslatable sequence
                if (keepUntranslatedEscapes) {
                    output.append (escapeChar);
                }
                output.append (metac);
            } else {
                output.append (chars.charAt (k));   // its corresponding true char
            }

            p = i + 2;  // skip over both escapeChar & metac
        }

        if (p < len) {
            output.append (subject.substring (p));
        }
        return output.toString ();
    }

    // Scanner states for parseNumber: which part of the number is being read.
    private static final int INT = 0;
    private static final int FRAC = 1;
    private static final int EXP = 2;

    /**
     * Parse a number from a string.  Finds the first recognizable base-10 number
     * (integer or floating point) in the string and returns it as a Number.
     * Uses American English conventions (i.e., '.' as decimal point and ','
     * as thousands separator).  Commas are only skipped in the integer part.
     * A '.' or 'e' that is not followed by digits is not part of the number,
     * so "page 5." yields the integer 5.
     * @param s String to parse
     * @return first recognizable number: an Integer if it has neither a
     * fraction nor an exponent, otherwise a Float
     * @exception NumberFormatException if no recognizable number is found,
     * or if an integer is too large to be represented as an Integer
     */
    public static Number parseNumber (String s) throws NumberFormatException {
        int len = s.length ();
        for (int i = 0; i < len; ++i) {
            if (!Character.isDigit (s.charAt (i))) {
                continue;
            }

            int start = i;
            int end = i + 1;        // exclusive end of the text accepted so far
            int state = INT;        // part of the number the scanner is in
            int numState = INT;     // state as of the last accepted character

            // Pull in a leading decimal point and/or minus sign.
            if (start > 0 && s.charAt (start - 1) == '.') {
                --start;
                state = FRAC;
                numState = FRAC;
            }
            if (start > 0 && s.charAt (start - 1) == '-') {
                --start;
            }

            int j = end;
            foundEnd:
            while (j < len) {
                char c = s.charAt (j);
                switch (c) {
                    case '0': case '1': case '2': case '3': case '4':
                    case '5': case '6': case '7': case '8': case '9':
                        end = ++j;
                        numState = state;
                        break;
                    case '.':
                        if (state != INT) {
                            break foundEnd;
                        }
                        // tentative: only counts if a digit follows
                        state = FRAC;
                        ++j;
                        break;
                    case ',':
                        // thousands separators only occur in the integer part
                        if (state != INT) {
                            break foundEnd;
                        }
                        ++j;
                        break;
                    case 'e':
                    case 'E': {
                        if (state == EXP) {
                            break foundEnd;     // only one exponent allowed
                        }
                        int k = j + 1;
                        if (k < len && (s.charAt (k) == '+' || s.charAt (k) == '-')) {
                            ++k;
                        }
                        if (k >= len || !isAsciiDigit (s.charAt (k))) {
                            break foundEnd;     // 'e' starts a word, not an exponent
                        }
                        state = EXP;
                        numState = EXP;
                        end = k + 1;
                        j = end;
                        break;
                    }
                    default:
                        break foundEnd;
                }
            }

            String num = replace (s.substring (start, end), ",", "");
            if (numState == INT) {
                return Integer.valueOf (num);
            } else {
                return Float.valueOf (num);
            }
        }
        throw new NumberFormatException (s);
    }

    /** Tests whether c is one of the characters '0' through '9'. */
    private static boolean isAsciiDigit (char c) {
        return c >= '0' && c <= '9';
    }

    /*
      For testing parseNumber

    public static void main (String[] args) {
      for (int i=0; i<args.length; ++i)
          System.out.println (parseNumber (args[i]));
    }
    */


    /**
     * Generate a string by concatenating n copies of another string.
     * @param s String to repeat
     * @param n number of times to repeat s
     * @return s concatenated with itself n times (empty if n is not positive)
     */
    public static String repeat (String s, int n) {
        StringBuffer out = new StringBuffer ();
        for (int i = 0; i < n; ++i) {
            out.append (s);
        }
        return out.toString ();
    }

    /**
     * Compress whitespace.
     * @param s String to compress
     * @return string with leading and trailing whitespace removed, and
     * internal runs of whitespace replaced by a single space character
     */
    public static String compressWhitespace (String s) {
        StringBuffer output = new StringBuffer ();
        int len = s.length ();
        int i = 0;
        while (i < len) {
            // skip a run of whitespace
            while (i < len && Character.isWhitespace (s.charAt (i))) {
                ++i;
            }
            if (i == len) {
                break;
            }

            // scan a run of non-whitespace
            int wordStart = i;
            while (i < len && !Character.isWhitespace (s.charAt (i))) {
                ++i;
            }

            // The separator goes in front of every word but the first, so
            // trailing whitespace never leaves a space at the end.
            if (output.length () > 0) {
                output.append (' ');
            }
            output.append (s.substring (wordStart, i));
        }
        return output.toString ();
    }

    /**
     * Test if string contains only whitespace.
     * @param s String to test
     * @return true iff all characters in s satisfy Character.isWhitespace().
     * If s is empty, returns true.
     */
    public static boolean isWhitespace (String s) {
        for (int i = 0, n = s.length (); i < n; ++i) {
            if (!Character.isWhitespace (s.charAt (i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Concatenate an array of strings.
     * @param list Array of strings to concatenate
     * @param sep Separator to insert between each string
     * @return string consisting of list[0] + sep + list[1] + sep + ... + sep + list[list.length-1]
     */
    public static String join (String[] list, String sep) {
        StringBuffer result = new StringBuffer ();
        for (int i = 0; i < list.length; ++i) {
            if (i > 0) {
                result.append (sep);
            }
            result.append (list[i]);
        }
        return result.toString ();
    }

    /**
     * Abbreviate a string.
     * @param s String to abbreviate
     * @param max Maximum length of returned string; must be at least 5
     * @return s with whitespace compressed (see compressWhitespace) and, if
     * it is still longer than max, enough characters removed from
     * the middle (replaced by "...") to make its length at most max
     */
    public static String abbreviate (String s, int max) {
        s = compressWhitespace (s);
        if (s.length () <= max) {
            return s;
        }
        int half = Math.max (max - 3, 2) / 2;   // 3 characters go to "..."
        return s.substring (0, half) + "..." + s.substring (s.length () - half);
    }

    /**
     * Abbreviate a multi-line string.
     * @param s String to abbreviate
     * @param maxLines Max number of lines in returned string; must be at least 3
     * @param message Message to replace removed lines with; should end with
     * \n, but may be multiple lines.  Occurrences of %d are replaced with
     * the number of lines removed.
     * @return s unchanged if it has at most maxLines lines; otherwise
     * s with enough whole lines removed from the middle (replaced by message)
     * to bring it down to maxLines lines when message is a single line
     */
    public static String abbreviateLines (String s, int maxLines, String message) {
        int nLines = countLines (s);
        if (nLines <= maxLines) {
            return s;
        }
        int half = Math.max (maxLines - 1, 2) / 2;  // one line goes to message
        String removed = String.valueOf (nLines - half * 2);
        return s.substring (0, nthLine (s, half))
            + replace (message, "%d", removed)
            + s.substring (nthLine (s, -half));
    }

    /**
     * Count the lines in a string.
     * @param s String to examine
     * @return one more than the number of '\n' characters in s
     */
    static int countLines (String s) {
        int n = 1;
        int i = -1;
        while ((i = s.indexOf ('\n', i + 1)) != -1) {
            ++n;
        }
        return n;
    }

    /**
     * Find the offset at which a line starts.
     * @param s String to examine
     * @param n Line to find.  If n is nonnegative, lines are counted from
     * the start of s (line 0 starts at offset 0).  If n is negative, lines are
     * counted from the end of s (line -1 is the last line).
     * @return offset just after the corresponding '\n' in s; 0 if s does
     * not contain that many '\n' characters
     */
    static int nthLine (String s, int n) {
        if (n >= 0) {
            int i = -1;
            while (n > 0 && (i = s.indexOf ('\n', i + 1)) != -1) {
                --n;
            }
            return i + 1;
        } else {
            int i = s.length ();
            while (n < 0 && (i = s.lastIndexOf ('\n', i - 1)) != -1) {
                ++n;
            }
            return i + 1;
        }
    }

    /**
     * Split string around a substring match and return prefix.
     * @param s String to split
     * @param pat Substring to search for in s
     * @return Prefix of s ending just before the first occurrence
     * of pat.  If pat is not found in s, returns s itself.
     */
    public static String before (String s, String pat) {
        int i = s.indexOf (pat);
        if (i < 0) {
            return s;
        }
        return s.substring (0, i);
    }

    /**
     * Split string around a substring match and return suffix.
     * @param s String to split
     * @param pat Substring to search for in s
     * @return Suffix of s starting just after the first occurrence
     * of pat.  If pat is not found in s, returns "".
     */
    public static String after (String s, String pat) {
        int i = s.indexOf (pat);
        if (i < 0) {
            return "";
        }
        return s.substring (i + pat.length ());
    }


    /**
     * Like String.startsWith, but case-insensitive.
     * @param s String to test
     * @param prefix Prefix to look for
     * @return true iff s starts with prefix, ignoring case
     */
    public static boolean startsWithIgnoreCase (String s, String prefix) {
        // regionMatches returns false by itself when prefix is longer than s
        return s.regionMatches (true, 0, prefix, 0, prefix.length ());
    }

    /**
     * Like String.endsWith, but case-insensitive.
     * @param s String to test
     * @param suffix Suffix to look for
     * @return true iff s ends with suffix, ignoring case
     */
    public static boolean endsWithIgnoreCase (String s, String suffix) {
        // a negative offset (suffix longer than s) makes regionMatches return false
        int suffixLen = suffix.length ();
        return s.regionMatches (true, s.length () - suffixLen, suffix, 0, suffixLen);
    }

    /**
     * Expands tabs to spaces.  The column counter restarts after every
     * '\r' or '\n'.
     * @param s String in which to expand tabs
     * @param tabsize Distance between tab stops, in columns; must be positive
     * @return s with every tab replaced by enough spaces to reach the next
     * tab stop
     */
    public static String untabify (String s, int tabsize) {
        if (s.indexOf ('\t') == -1) {
            return s; // no tabs, don't bother
        }

        StringBuffer result = new StringBuffer ();
        int col = 0;
        // Delimiters are returned as one-character tokens of their own.
        StringTokenizer tokenizer = new StringTokenizer (s, "\t\r\n", true);
        while (tokenizer.hasMoreTokens ()) {
            String tok = tokenizer.nextToken ();
            char first = tok.charAt (0);
            if (first == '\t') {
                int nextStop = (col / tabsize + 1) * tabsize;
                result.append (repeat (" ", nextStop - col));
                col = nextStop;
            } else {
                result.append (tok);
                col = (first == '\r' || first == '\n') ? 0 : col + tok.length ();
            }
        }

        return result.toString ();
    }

    /**
     * Reverse a string.
     * @param s String to reverse
     * @return string containing characters of s in reverse order
     * (surrogate pairs are kept intact)
     */
    public static String reverse (String s) {
        return new StringBuffer (s).reverse ().toString ();
    }

    /**
     * Find longest common prefix of two strings.
     * @param s First string
     * @param t Second string
     * @return longest string that both s and t start with
     */
    public static String longestCommonPrefix (String s, String t) {
        return s.substring (0, longestCommonPrefixLength (s, t));
    }

    /**
     * Find length of the longest common prefix of two strings.
     * @param s First string
     * @param t Second string
     * @return number of leading characters that s and t have in common
     */
    public static int longestCommonPrefixLength (String s, String t) {
        int limit = Math.min (s.length (), t.length ());
        int k = 0;
        while (k < limit && s.charAt (k) == t.charAt (k)) {
            ++k;
        }
        return k;
    }

    /**
     * Find longest common suffix of two strings.
     * @param s First string
     * @param t Second string
     * @return longest string that both s and t end with
     */
    public static String longestCommonSuffix (String s, String t) {
        return s.substring (s.length () - longestCommonSuffixLength (s, t));
    }

    /**
     * Find length of the longest common suffix of two strings.
     * @param s First string
     * @param t Second string
     * @return number of trailing characters that s and t have in common
     */
    public static int longestCommonSuffixLength (String s, String t) {
        int i = s.length () - 1;
        int j = t.length () - 1;
        int n = 0;
        while (i >= 0 && j >= 0 && s.charAt (i) == t.charAt (j)) {
            --i;
            --j;
            ++n;
        }
        return n;
    }


    /**
     * Find longest common prefix of two strings, ignoring case.
     * @param s First string
     * @param t Second string
     * @return longest prefix of s that t also starts with, ignoring case
     */
    public static String longestCommonPrefixIgnoreCase (String s, String t) {
        return s.substring (0, longestCommonPrefixLengthIgnoreCase (s, t));
    }

    /**
     * Find length of the longest common prefix of two strings, ignoring case.
     * Characters are compared after Character.toLowerCase().
     * @param s First string
     * @param t Second string
     * @return number of leading characters that s and t have in common
     */
    public static int longestCommonPrefixLengthIgnoreCase (String s, String t) {
        int limit = Math.min (s.length (), t.length ());
        int k = 0;
        while (k < limit
               && Character.toLowerCase (s.charAt (k)) == Character.toLowerCase (t.charAt (k))) {
            ++k;
        }
        return k;
    }

    /**
     * Find longest common suffix of two strings, ignoring case.
     * @param s First string
     * @param t Second string
     * @return longest suffix of s that t also ends with, ignoring case
     */
    public static String longestCommonSuffixIgnoreCase (String s, String t) {
        return s.substring (s.length () - longestCommonSuffixLengthIgnoreCase (s, t));
    }

    /**
     * Find length of the longest common suffix of two strings, ignoring case.
     * Characters are compared after Character.toLowerCase().
     * @param s First string
     * @param t Second string
     * @return number of trailing characters that s and t have in common
     */
    public static int longestCommonSuffixLengthIgnoreCase (String s, String t) {
        int i = s.length () - 1;
        int j = t.length () - 1;
        int n = 0;
        while (i >= 0 && j >= 0
               && Character.toLowerCase (s.charAt (i)) == Character.toLowerCase (t.charAt (j))) {
            --i;
            --j;
            ++n;
        }
        return n;
    }
}