/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.fileupload.util.mime;

import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Base64;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

/**
 * Utility class to decode MIME texts.
 *
 * @since 1.3
 */
public final class MimeUtility {

    /**
     * The {@code US-ASCII} charset identifier constant.
     */
    private static final String US_ASCII_CHARSET = "US-ASCII";

    /**
     * The marker to indicate text is encoded with BASE64 algorithm.
     */
    private static final String BASE64_ENCODING_MARKER = "B";

    /**
     * The marker to indicate text is encoded with QuotedPrintable algorithm.
     */
    private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q";

    /**
     * If the text contains any encoded tokens, those tokens will be marked with "=?".
     */
    private static final String ENCODED_TOKEN_MARKER = "=?";

    /**
     * If the text contains any encoded tokens, those tokens will terminate with "?=".
     */
    private static final String ENCODED_TOKEN_FINISHER = "?=";

    /**
     * The linear whitespace chars sequence.
     */
    private static final String LINEAR_WHITESPACE = " \t\r\n";

    /**
     * Mappings between MIME and Java charset. Keys are lower-case MIME names.
     */
    private static final Map<String, String> MIME2JAVA = new HashMap<>();

    static {
        MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
        MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
        MIME2JAVA.put("utf-8", "UTF8");
        MIME2JAVA.put("utf8", "UTF8");
        MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
        MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
        MIME2JAVA.put("euc-kr", "KSC5601");
        MIME2JAVA.put("euckr", "KSC5601");
        MIME2JAVA.put("us-ascii", "ISO-8859-1");
        MIME2JAVA.put("x-us-ascii", "ISO-8859-1");
    }

    /**
     * Decode a string of text obtained from a mail header into
     * its proper form. The text generally will consist of a
     * string of tokens, some of which may be encoded using
     * base64 or quoted-printable encoding.
     *
     * <p>Whitespace between two successfully decoded tokens is dropped; whitespace next to
     * a plain token is kept. Whitespace at the very end of the text is discarded.</p>
     *
     * @param text The text to decode; must not be {@code null}.
     * @return The decoded text string.
     * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
     */
    public static String decodeText(final String text) throws UnsupportedEncodingException {
        // if the text contains any encoded tokens, those tokens will be marked with "=?". If the
        // source string doesn't contain that sequence, no decoding is required.
        if (!text.contains(ENCODED_TOKEN_MARKER)) {
            return text;
        }

        int offset = 0;
        final int endOffset = text.length();

        // bounds of the most recently scanned whitespace run; startWhiteSpace == -1 means
        // there is no pending run waiting to be appended.
        int startWhiteSpace = -1;
        int endWhiteSpace = -1;

        final StringBuilder decodedText = new StringBuilder(text.length());

        boolean previousTokenEncoded = false;

        while (offset < endOffset) {
            char ch = text.charAt(offset);

            // is this a whitespace character?
            if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
                startWhiteSpace = offset;
                while (offset < endOffset) {
                    // step over the white space characters.
                    ch = text.charAt(offset);
                    if (LINEAR_WHITESPACE.indexOf(ch) == -1) {
                        // record the location of the first non lwsp and drop down to process the
                        // token characters.
                        endWhiteSpace = offset;
                        break;
                    }
                    offset++;
                }
                // NB: if the text ends inside this run, no token follows to append it, so
                // trailing whitespace on these header strings is just discarded.
            } else {
                // we have a word token. We need to scan over the word and then try to parse it.
                final int wordStart = offset;

                while (offset < endOffset) {
                    // step over the non white space characters.
                    ch = text.charAt(offset);
                    if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
                        break;
                    }
                    offset++;
                }
                // pull out the word token.
                final String word = text.substring(wordStart, offset);
                // is the token encoded? decode the word
                if (word.startsWith(ENCODED_TOKEN_MARKER)) {
                    try {
                        // if this gives a parsing failure, treat it like a non-encoded word.
                        final String decodedWord = decodeWord(word);

                        // whitespace is only significant when the previous token was not an
                        // encoded one. Append it if we've got it.
                        if (!previousTokenEncoded && startWhiteSpace != -1) {
                            decodedText.append(text, startWhiteSpace, endWhiteSpace);
                            startWhiteSpace = -1;
                        }
                        // this is definitely a decoded token.
                        previousTokenEncoded = true;
                        // and add this to the text.
                        decodedText.append(decodedWord);
                        // we continue parsing from here...we allow parsing errors to fall through
                        // and get handled as normal text.
                        continue;

                    } catch (final ParseException ignored) {
                        // not a well-formed encoded-word: fall through and copy it verbatim below.
                    }
                }
                // this is a normal token, so it doesn't matter what the previous token was. Add the white space
                // if we have it.
                if (startWhiteSpace != -1) {
                    decodedText.append(text, startWhiteSpace, endWhiteSpace);
                    startWhiteSpace = -1;
                }
                // this is not a decoded token.
                previousTokenEncoded = false;
                decodedText.append(word);
            }
        }

        return decodedText.toString();
    }

    /**
     * Parse a string using the RFC 2047 rules for an "encoded-word"
     * type. This encoding has the syntax:
     *
     * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
     *
     * <p>The encoding marker ({@code B} or {@code Q}) is matched case-insensitively, as
     * RFC 2047 declares encoding names case-independent. Any characters following the
     * first {@code "?="} terminator are not part of the decoded result.</p>
     *
     * @param word The possibly encoded word value.
     * @return The decoded word; the empty string if the encoded text is empty.
     * @throws ParseException in case of a parse error of the RFC 2047
     * @throws UnsupportedEncodingException Thrown when Invalid RFC 2047 encoding was found;
     *         the underlying failure is attached as the cause.
     */
    private static String decodeWord(final String word) throws ParseException, UnsupportedEncodingException {
        // encoded words start with the characters "=?". If this not an encoded word, we throw a
        // ParseException for the caller.

        if (!word.startsWith(ENCODED_TOKEN_MARKER)) {
            throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
        }

        final int charsetPos = word.indexOf('?', 2);
        if (charsetPos == -1) {
            throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
        }

        // pull out the character set information (this is the MIME name at this point).
        final String charset = word.substring(2, charsetPos).toLowerCase(Locale.ROOT);

        // now pull out the encoding token the same way.
        final int encodingPos = word.indexOf('?', charsetPos + 1);
        if (encodingPos == -1) {
            throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
        }

        final String encoding = word.substring(charsetPos + 1, encodingPos);

        // and finally the encoded text.
        final int encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1);
        if (encodedTextPos == -1) {
            throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
        }

        final String encodedText = word.substring(encodingPos + 1, encodedTextPos);

        // seems a bit silly to encode a null string, but easy to deal with.
        if (encodedText.isEmpty()) {
            return "";
        }

        try {
            // the decoder writes directly to an output stream.
            final ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());

            final byte[] encodedData = encodedText.getBytes(US_ASCII_CHARSET);

            // Base64 encoded? (RFC 2047: "B"/"b" and "Q"/"q" are equivalent)
            if (BASE64_ENCODING_MARKER.equalsIgnoreCase(encoding)) {
                out.write(Base64.getDecoder().decode(encodedData));
            } else if (QUOTEDPRINTABLE_ENCODING_MARKER.equalsIgnoreCase(encoding)) { // maybe quoted printable.
                QuotedPrintableDecoder.decode(encodedData, out);
            } else {
                throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
            }
            // get the decoded byte data and convert into a string.
            final byte[] decodedData = out.toByteArray();
            return new String(decodedData, javaCharset(charset));
        } catch (final Exception e) {
            // The broad catch is deliberate: every failure in this block (including the
            // IllegalArgumentException that Base64 raises on malformed input) is reported to
            // callers as UnsupportedEncodingException. That type has no cause-taking
            // constructor, so the original failure is attached via initCause.
            final UnsupportedEncodingException failure =
                    new UnsupportedEncodingException("Invalid RFC 2047 encoding");
            failure.initCause(e);
            throw failure;
        }
    }

    /**
     * Translate a MIME standard character set name into the Java
     * equivalent.
     *
     * @param charset The MIME standard name.
     * @return The Java equivalent for this name, the name itself if no mapping is registered,
     *         or {@code null} if {@code charset} is {@code null}.
     */
    private static String javaCharset(final String charset) {
        // nothing in, nothing out.
        if (charset == null) {
            return null;
        }

        // if there is no mapping, then the original name is used. Many of the MIME character set
        // names map directly back into Java. The reverse isn't necessarily true.
        return MIME2JAVA.getOrDefault(charset.toLowerCase(Locale.ROOT), charset);
    }

    /**
     * Hidden constructor, this class must not be instantiated.
     */
    private MimeUtility() {
        // do nothing
    }

}