/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package de.danielnaber.languagetool.tools;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.regex.Pattern;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;

import de.danielnaber.languagetool.Language;
import de.danielnaber.languagetool.gui.Tools;
import de.danielnaber.languagetool.rules.RuleMatch;
import de.danielnaber.languagetool.rules.patterns.PatternRule;

/**
 * Static helpers for reading streams, changing and testing letter case,
 * escaping markup, rendering rule matches as XML and handling UI labels.
 *
 * @author Daniel Naber
 */
public final class StringTools {

  private static final int DEFAULT_CONTEXT_SIZE = 25;

  /**
   * Constants for printing XML rule matches.
   */
  public enum XmlPrintMode {
    /**
     * Normally output the rule matches by starting and
     * ending the XML output on every call.
     */
    NORMAL_XML,
    /**
     * Start XML output by printing the preamble and the
     * start of the root element.
     */
    START_XML,
    /**
     * End XML output by closing the root element.
     */
    END_XML,
    /**
     * Simply continue rule match output.
     */
    CONTINUE_XML
  }

  /** Matches an XML comment, including comments that span several lines. */
  private static final Pattern XML_COMMENT_PATTERN =
      Pattern.compile("<!--.*?-->", Pattern.DOTALL);

  /**
   * Matches an XML tag; the negative lookbehind skips a '&lt;' that directly
   * follows another '&lt;'.
   */
  private static final Pattern XML_PATTERN =
      Pattern.compile("(?<!<)<[^<>]+>", Pattern.DOTALL);

  private StringTools() {
    // only static stuff
  }

  /**
   * Throw exception if the given string is null or empty or only whitespace.
   *
   * @param s the value to check
   * @param varName name used for the value in the exception message
   * @throws NullPointerException if {@code s} is null
   * @throws IllegalArgumentException if {@code s} is empty after trimming
   */
  public static void assureSet(final String s, final String varName) {
    if (s == null) {
      throw new NullPointerException(varName + " cannot be null");
    }
    if (isEmpty(s.trim())) {
      throw new IllegalArgumentException(varName + " cannot be empty or whitespace only");
    }
  }

  /**
   * Read a file's content. Delegates to {@link #readFile(InputStream, String)}
   * with a {@code null} encoding.
   *
   * @param file InputStream to a file to be read
   * @return a string with the file's content, every line followed by a newline
   * @throws IOException if reading fails
   */
  public static String readFile(final InputStream file) throws IOException {
    return readFile(file, null);
  }

  /**
   * Read the text file using the given encoding. The stream is closed before
   * this method returns.
   *
   * @param file
   *          InputStream to a file to be read
   * @param encoding
   *          the file's character encoding (e.g. iso-8859-1); if null, the
   *          reader is created without an explicit encoding
   * @return a string with the file's content, every line followed by a
   *         newline character
   * @throws IOException if reading fails
   */
  public static String readFile(final InputStream file, final String encoding)
      throws IOException {
    InputStreamReader isr = null;
    BufferedReader br = null;
    final StringBuilder sb = new StringBuilder();
    try {
      if (encoding == null) {
        isr = new InputStreamReader(file);
      } else {
        isr = new InputStreamReader(file, encoding);
      }
      br = new BufferedReader(isr);
      String line;
      while ((line = br.readLine()) != null) {
        sb.append(line);
        sb.append('\n');
      }
    } finally {
      if (br != null) {
        br.close();
      }
      if (isr != null) {
        isr.close();
      }
    }
    return sb.toString();
  }

  /**
   * Returns true if {@code str} equals its own upper-cased form, i.e. it is
   * made up of all-uppercase characters (ignoring characters for which no
   * upper-/lowercase distinction exists).
   *
   * @param str input string
   */
  public static boolean isAllUppercase(final String str) {
    return str.equals(str.toUpperCase());
  }

  /**
   * Returns true if {@code str} is MixedCase: it is neither all-uppercase,
   * nor a capitalized word, nor all-lowercase.
   *
   * @param str input string
   */
  public static boolean isMixedCase(final String str) {
    return !isAllUppercase(str)
        && !isCapitalizedWord(str)
        && !str.equals(str.toLowerCase());
  }

  /**
   * Returns true if the first character of {@code str} is uppercase and the
   * remainder equals its own lower-cased form.
   *
   * @param str input string
   * @return false for a null or empty string
   */
  public static boolean isCapitalizedWord(final String str) {
    if (isEmpty(str)) {
      return false;
    }
    if (Character.isUpperCase(str.charAt(0))) {
      final String rest = str.substring(1);
      return rest.equals(rest.toLowerCase());
    }
    return false;
  }

  /**
   * Whether the first character of {@code str} is an uppercase character.
   *
   * @param str input string
   * @return false for a null or empty string
   */
  public static boolean startsWithUppercase(final String str) {
    return !isEmpty(str) && Character.isUpperCase(str.charAt(0));
  }

  /**
   * Return {@code str} modified so that its first character is now an
   * uppercase character. If {@code str} starts with characters that are
   * neither letters nor digits, such as quotes or parentheses, the first
   * letter or digit is changed instead.
   */
  public static String uppercaseFirstChar(final String str) {
    return changeFirstCharCase(str, true);
  }

  /**
   * Return {@code str} modified so that its first character is now a
   * lowercase character. If {@code str} starts with characters that are
   * neither letters nor digits, such as quotes or parentheses, the first
   * letter or digit is changed instead.
   */
  public static String lowercaseFirstChar(final String str) {
    return changeFirstCharCase(str, false);
  }

  /**
   * Return {@code str} modified so that its first character is now a
   * lowercase or uppercase character, depending on {@code toUpperCase}.
   * Leading characters that are neither letters nor digits are skipped; if
   * there is no letter or digit at all, the last character is the one passed
   * through the case conversion.
   */
  private static String changeFirstCharCase(final String str, final boolean toUpperCase) {
    if (isEmpty(str)) {
      return str;
    }
    if (str.length() == 1) {
      return toUpperCase ? str.toUpperCase() : str.toLowerCase();
    }
    int pos = 0;
    final int len = str.length() - 1;
    // stop at the first letter or digit, or at the last character
    while (!Character.isLetterOrDigit(str.charAt(pos)) && len > pos) {
      pos++;
    }
    final char firstChar = str.charAt(pos);
    return str.substring(0, pos)
        + (toUpperCase ? Character.toUpperCase(firstChar) : Character.toLowerCase(firstChar))
        + str.substring(pos + 1);
  }

  /**
   * Reads the given reader until it is exhausted and returns everything that
   * was read. The reader is not closed.
   *
   * @param reader source of the characters
   * @return the characters read, possibly an empty string
   * @throws IOException if reading fails
   */
  public static String readerToString(final Reader reader) throws IOException {
    final StringBuilder sb = new StringBuilder();
    final char[] chars = new char[4000];
    while (true) {
      final int readChars = reader.read(chars, 0, chars.length);
      if (readChars <= 0) {
        break;
      }
      // append the buffer slice directly, no temporary String needed
      sb.append(chars, 0, readChars);
    }
    return sb.toString();
  }

  /**
   * Reads the given stream until it is exhausted, decoding it with an
   * {@link InputStreamReader} created without an explicit encoding. The
   * stream is closed before this method returns.
   *
   * @param is stream to read
   * @return the decoded content
   * @throws IOException if reading fails
   */
  public static String streamToString(final InputStream is) throws IOException {
    final InputStreamReader isr = new InputStreamReader(is);
    try {
      return readerToString(isr);
    } finally {
      isr.close();
    }
  }

  /**
   * Calls escapeHTML(String).
   */
  public static String escapeXML(final String s) {
    return escapeHTML(s);
  }

  /**
   * Escapes these characters: less than, bigger than, quote, ampersand.
   *
   * @param s text to escape, must not be null
   * @return the text with the four characters replaced by their entities
   */
  public static String escapeHTML(final String s) {
    // this version is much faster than using s.replaceAll
    final StringBuilder sb = new StringBuilder();
    final int n = s.length();
    for (int i = 0; i < n; i++) {
      final char c = s.charAt(i);
      switch (c) {
        case '<':
          sb.append("&lt;");
          break;
        case '>':
          sb.append("&gt;");
          break;
        case '&':
          sb.append("&amp;");
          break;
        case '"':
          sb.append("&quot;");
          break;
        default:
          sb.append(c);
          break;
      }
    }
    return sb.toString();
  }

  /**
   * Get an XML representation of the given rule matches.
   *
   * @param ruleMatches the matches to print
   * @param text
   *          the original text that was checked, used to get the context of the
   *          matches
   * @param contextSize
   *          the desired context size in characters
   * @deprecated Use {@link #ruleMatchesToXML(List,String,int,XmlPrintMode)} instead
   */
  @Deprecated
  public static String ruleMatchesToXML(final List<RuleMatch> ruleMatches,
      final String text, final int contextSize) {
    return ruleMatchesToXML(ruleMatches, text, contextSize, XmlPrintMode.NORMAL_XML);
  }

  /**
   * Get an XML representation of the given rule matches.
   *
   * @param ruleMatches the matches to print, one element per match
   * @param text
   *          the original text that was checked, used to get the context of the
   *          matches
   * @param contextSize
   *          the desired context size in characters
   * @param xmlMode how to print the XML
   */
  public static String ruleMatchesToXML(final List<RuleMatch> ruleMatches,
      final String text, final int contextSize, final XmlPrintMode xmlMode) {
    //
    // IMPORTANT: people rely on this format, don't change it!
    //
    final StringBuilder xml = new StringBuilder();

    if (xmlMode == XmlPrintMode.NORMAL_XML || xmlMode == XmlPrintMode.START_XML) {
      xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
      xml.append("<matches>\n");
    }

    for (final RuleMatch match : ruleMatches) {
      String subId = "";
      if (match.getRule() instanceof PatternRule) {
        final PatternRule pRule = (PatternRule) match.getRule();
        if (pRule.getSubId() != null) {
          subId = " subId=\"" + escapeXMLForAPIOutput(pRule.getSubId()) + "\" ";
        }
      }
      xml.append("<error" + " fromy=\"" + match.getLine() + "\""
          + " fromx=\"" + (match.getColumn() - 1) + "\""
          + " toy=\"" + match.getEndLine() + "\""
          + " tox=\"" + (match.getEndColumn() - 1) + "\""
          + " ruleId=\"" + match.getRule().getId() + "\"");
      // suggestion markup of the message is shown as plain quotes
      final String msg = match.getMessage().replaceAll("</?suggestion>", "'");
      xml.append(subId);
      xml.append(" msg=\"" + escapeXMLForAPIOutput(msg) + "\"");
      final String START_MARKER = "__languagetool_start_marker";
      String context = Tools.getContext(match.getFromPos(), match.getToPos(),
          text, contextSize, START_MARKER, "", true);
      xml.append(" replacements=\""
          + escapeXMLForAPIOutput(listToString(match.getSuggestedReplacements(), "#"))
          + "\"");
      // get position of error in context and remove artificial marker again:
      final int contextOffset = context.indexOf(START_MARKER);
      context = context.replaceFirst(START_MARKER, "");
      context = context.replaceAll("[\n\r]", " ");
      xml.append(" context=\"" + context + "\"");
      xml.append(" contextoffset=\"" + contextOffset + "\"");
      xml.append(" errorlength=\"" + (match.getToPos() - match.getFromPos()) + "\"");
      xml.append("/>\n");
    }

    if (xmlMode == XmlPrintMode.END_XML || xmlMode == XmlPrintMode.NORMAL_XML) {
      xml.append("</matches>\n");
    }
    return xml.toString();
  }

  /**
   * Escapes the string like {@link #escapeXML(String)} and replaces every
   * line break character by a space.
   */
  private static String escapeXMLForAPIOutput(final String s) {
    // this is simplified XML, i.e. put the "<error>" in one line:
    return escapeXML(s).replaceAll("[\n\r]", " ");
  }

  /**
   * Joins the elements of the collection in iteration order.
   *
   * @param l the strings to join
   * @param delimiter text put between two neighbouring elements
   * @return the joined string, empty for an empty collection
   */
  public static String listToString(final Collection<String> l, final String delimiter) {
    final StringBuilder sb = new StringBuilder();
    for (final Iterator<String> iter = l.iterator(); iter.hasNext();) {
      final String str = iter.next();
      sb.append(str);
      if (iter.hasNext()) {
        sb.append(delimiter);
      }
    }
    return sb.toString();
  }

  /**
   * Same as {@link #getContext(int, int, String, int)} with a context size of
   * {@value #DEFAULT_CONTEXT_SIZE} characters.
   */
  public static String getContext(final int fromPos, final int toPos,
      final String fileContents) {
    return getContext(fromPos, toPos, fileContents, DEFAULT_CONTEXT_SIZE);
  }

  /**
   * Builds a two-line plain text view of a region of the text: the first line
   * is the region extended by {@code contextSize} characters on both sides
   * (with "..." where the text was cut off), the second line marks the region
   * with '^' characters.
   *
   * @param fromPos start of the region (inclusive)
   * @param toPos end of the region (exclusive)
   * @param contents the complete text; newlines are shown as spaces
   * @param contextSize number of characters to show before and after the region
   * @return context line, a newline, and the marker line
   */
  public static String getContext(final int fromPos, final int toPos,
      final String contents, final int contextSize) {
    final String fileContents = contents.replace('\n', ' ');
    // calculate context region:
    int startContent = fromPos - contextSize;
    String prefix = "...";
    String postfix = "...";
    String markerPrefix = "   ";
    if (startContent < 0) {
      prefix = "";
      markerPrefix = "";
      startContent = 0;
    }
    int endContent = toPos + contextSize;
    if (endContent > fileContents.length()) {
      postfix = "";
      endContent = fileContents.length();
    }
    // make "^" marker. inefficient but robust implementation:
    final StringBuilder marker = new StringBuilder();
    for (int i = 0; i < fileContents.length() + prefix.length(); i++) {
      if (i >= fromPos && i < toPos) {
        marker.append('^');
      } else {
        marker.append(' ');
      }
    }
    // now build context string plus marker:
    final StringBuilder sb = new StringBuilder();
    sb.append(prefix);
    sb.append(fileContents.substring(startContent, endContent));
    sb.append(postfix);
    sb.append('\n');
    sb.append(markerPrefix);
    sb.append(marker.substring(startContent, endContent));
    return sb.toString();
  }

  /**
   * Removes every newline, space and tab character from the string, not only
   * leading and trailing ones. Useful for trimming the contents of token
   * elements that cannot possibly contain any spaces.
   *
   * @param str
   *          String to be filtered.
   * @return Filtered string.
   */
  public static String trimWhitespace(final String str) {
    final StringBuilder filter = new StringBuilder();
    for (int i = 0; i < str.length(); i++) {
      final char c = str.charAt(i);
      if (c != '\n' && c != ' ' && c != '\t') {
        filter.append(c);
      }
    }
    return filter.toString();
  }

  /**
   * Adds spaces before words that are not punctuation.
   *
   * @param word
   *          Word to add the preceding space.
   * @param language
   *          Language of the word (to check typography conventions). Currently
   *          French convention of not adding spaces only before '.' and ',' is
   *          implemented; other languages assume that before ,.;:!? no spaces
   *          should be added.
   * @return String containing a space or an empty string.
   */
  public static String addSpace(final String word, final Language language) {
    String space = " ";
    final int len = word.length();
    if (len == 1) {
      final char c = word.charAt(0);
      if (Language.FRENCH.equals(language)) {
        if (c == '.' || c == ',') {
          space = "";
        }
      } else {
        if (c == '.' || c == ',' || c == ';' || c == ':' || c == '?' || c == '!') {
          space = "";
        }
      }
    }
    return space;
  }

  /**
   * Returns translation of the UI element without the control character "&amp;".
   * To have "&amp;" in the UI, use "&amp;&amp;".
   *
   * @param label
   *          Label to convert.
   * @return String UI element string without mnemonics.
   */
  public static String getLabel(final String label) {
    return label.replaceAll("&([^&])", "$1").replaceAll("&&", "&");
  }

  /**
   * Returns the UI element string with mnemonics encoded in OpenOffice.org
   * convention (using "~").
   *
   * @param label
   *          Label to convert
   * @return String UI element with ~ replacing &amp;.
   */
  public static String getOOoLabel(final String label) {
    return label.replaceAll("&([^&])", "~$1").replaceAll("&&", "&");
  }

  /**
   * Returns mnemonic of a UI element: the character that follows the first
   * single "&amp;". Every "&amp;&amp;" pair is an escaped ampersand and is
   * skipped; an "&amp;" at the very end of the label marks nothing.
   *
   * @param label
   *          String Label of the UI element
   * @return Mnemonic of the UI element, or the NUL character (code point 0)
   *         in case of no mnemonic set.
   */
  public static char getMnemonic(final String label) {
    final int lastIndex = label.length() - 1;
    int mnemonicPos = label.indexOf('&');
    // step over each escaped "&&" pair, not just the first one in the label
    while (mnemonicPos != -1 && mnemonicPos < lastIndex
        && label.charAt(mnemonicPos + 1) == '&') {
      mnemonicPos = label.indexOf('&', mnemonicPos + 2);
    }
    // no "&" at all, or a trailing "&" without a character after it
    if (mnemonicPos == -1 || mnemonicPos >= lastIndex) {
      return '\u0000';
    }
    return label.charAt(mnemonicPos + 1);
  }

  /**
   * Checks if a string is whitespace. The string is trimmed first; it counts
   * as whitespace if nothing is left, or if exactly one character is left and
   * {@link Character#isWhitespace(char)} accepts it (this covers Unicode
   * whitespace that trimming does not remove). The OpenOffice.org field
   * markers (code points 1 and 2) are never whitespace.
   *
   * @param str
   *          String to check
   * @return true if the string is whitespace-only.
   */
  public static boolean isWhitespace(final String str) {
    if ("\u0002".equals(str) // unbreakable field, e.g. a footnote number in OOo
        || "\u0001".equals(str)) { // breakable field in OOo
      return false;
    }
    final String trimStr = str.trim();
    if (isEmpty(trimStr)) {
      return true;
    }
    if (trimStr.length() == 1) {
      return Character.isWhitespace(trimStr.charAt(0));
    }
    return false;
  }

  /**
   * Checks for a non-zero decimal digit.
   *
   * @param ch
   *          Character to check
   * @return True if the character is a positive number (decimal digit from 1 to
   *         9).
   */
  public static boolean isPositiveNumber(final char ch) {
    return ch >= '1' && ch <= '9';
  }

  /**
   * Helper method to replace calls to "".equals().
   *
   * @param str
   *          String to check
   * @return true if string is empty OR null
   */
  public static boolean isEmpty(final String str) {
    return str == null || str.length() == 0;
  }

  /**
   * Simple XML filtering routine: comments are replaced by a space, tags are
   * removed.
   *
   * @param str XML string to be filtered.
   * @return Filtered string without XML tags.
   */
  public static String filterXML(final String str) {
    String s = str;
    s = XML_COMMENT_PATTERN.matcher(s).replaceAll(" ");
    s = XML_PATTERN.matcher(s).replaceAll("");
    return s;
  }

  /**
   * Null-safe {@code toString()}.
   *
   * @param s the character sequence, may be null
   * @return {@code s.toString()}, or null if {@code s} is null
   */
  public static String asString(final CharSequence s) {
    if (s == null) {
      return null;
    }
    return s.toString();
  }

}