Mirror of https://gitee.com/dromara/hutool.git (synced 2025-10-26 02:39:20 +08:00)
			
		
		
		
	修复CsvParser中对正文中双引号处理逻辑问题
This commit is contained in:
		| @@ -2645,7 +2645,7 @@ public class CharSequenceUtil extends StrValidator { | |||||||
| 		if (isEmpty(str)) { | 		if (isEmpty(str)) { | ||||||
| 			return toStringOrNull(str); | 			return toStringOrNull(str); | ||||||
| 		} | 		} | ||||||
| 		if (str.charAt(0) == prefix && str.charAt(str.length() - 1) == suffix) { | 		if (isWrap(str, prefix, suffix)) { | ||||||
| 			return sub(str, 1, str.length() - 1); | 			return sub(str, 1, str.length() - 1); | ||||||
| 		} | 		} | ||||||
| 		return str.toString(); | 		return str.toString(); | ||||||
|   | |||||||
| @@ -92,8 +92,8 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S | |||||||
| 	/** | 	/** | ||||||
| 	 * CSV解析器 | 	 * CSV解析器 | ||||||
| 	 * | 	 * | ||||||
| 	 * @param reader Reader | 	 * @param reader     Reader | ||||||
| 	 * @param config 配置,null则为默认配置 | 	 * @param config     配置,null则为默认配置 | ||||||
| 	 * @param bufferSize 默认缓存大小 | 	 * @param bufferSize 默认缓存大小 | ||||||
| 	 */ | 	 */ | ||||||
| 	public CsvParser(final Reader reader, final CsvReadConfig config, final int bufferSize) { | 	public CsvParser(final Reader reader, final CsvReadConfig config, final int bufferSize) { | ||||||
| @@ -109,7 +109,7 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S | |||||||
| 	 * @throws IllegalStateException 如果不解析头部或者没有调用nextRow()方法 | 	 * @throws IllegalStateException 如果不解析头部或者没有调用nextRow()方法 | ||||||
| 	 */ | 	 */ | ||||||
| 	public List<String> getHeader() { | 	public List<String> getHeader() { | ||||||
| 		if (config.headerLineNo  < 0) { | 		if (config.headerLineNo < 0) { | ||||||
| 			throw new IllegalStateException("No header available - header parsing is disabled"); | 			throw new IllegalStateException("No header available - header parsing is disabled"); | ||||||
| 		} | 		} | ||||||
| 		if (lineNo < config.beginLineNo) { | 		if (lineNo < config.beginLineNo) { | ||||||
| @@ -141,11 +141,11 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S | |||||||
| 			} | 			} | ||||||
|  |  | ||||||
| 			// 读取范围校验 | 			// 读取范围校验 | ||||||
| 			if(lineNo < config.beginLineNo){ | 			if (lineNo < config.beginLineNo) { | ||||||
| 				// 未达到读取起始行,继续 | 				// 未达到读取起始行,继续 | ||||||
| 				continue; | 				continue; | ||||||
| 			} | 			} | ||||||
| 			if(lineNo > config.endLineNo){ | 			if (lineNo > config.endLineNo) { | ||||||
| 				// 超出结束行,读取结束 | 				// 超出结束行,读取结束 | ||||||
| 				break; | 				break; | ||||||
| 			} | 			} | ||||||
| @@ -209,7 +209,7 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S | |||||||
| 	 * 空行是size为1的List,唯一元素是"" | 	 * 空行是size为1的List,唯一元素是"" | ||||||
| 	 * | 	 * | ||||||
| 	 * <p> | 	 * <p> | ||||||
| 	 *     行号要考虑注释行和引号包装的内容中的换行 | 	 * 行号要考虑注释行和引号包装的内容中的换行 | ||||||
| 	 * </p> | 	 * </p> | ||||||
| 	 * | 	 * | ||||||
| 	 * @return 一行数据 | 	 * @return 一行数据 | ||||||
| @@ -218,7 +218,7 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S | |||||||
| 	private List<String> readLine() throws IORuntimeException { | 	private List<String> readLine() throws IORuntimeException { | ||||||
| 		// 矫正行号 | 		// 矫正行号 | ||||||
| 		// 当一行内容包含多行数据时,记录首行行号,但是读取下一行时,需要把多行内容的行数加上 | 		// 当一行内容包含多行数据时,记录首行行号,但是读取下一行时,需要把多行内容的行数加上 | ||||||
| 		if(inQuotesLineCount > 0){ | 		if (inQuotesLineCount > 0) { | ||||||
| 			this.lineNo += this.inQuotesLineCount; | 			this.lineNo += this.inQuotesLineCount; | ||||||
| 			this.inQuotesLineCount = 0; | 			this.inQuotesLineCount = 0; | ||||||
| 		} | 		} | ||||||
| @@ -257,16 +257,16 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S | |||||||
| 			final char c = buf.get(); | 			final char c = buf.get(); | ||||||
|  |  | ||||||
| 			// 注释行标记 | 			// 注释行标记 | ||||||
| 			if(preChar < 0 || preChar == CharUtil.CR || preChar == CharUtil.LF){ | 			if (preChar < 0 || preChar == CharUtil.CR || preChar == CharUtil.LF) { | ||||||
| 				// 判断行首字符为指定注释字符的注释开始,直到遇到换行符 | 				// 判断行首字符为指定注释字符的注释开始,直到遇到换行符 | ||||||
| 				// 行首分两种,1是preChar < 0表示文本开始,2是换行符后紧跟就是下一行的开始 | 				// 行首分两种,1是preChar < 0表示文本开始,2是换行符后紧跟就是下一行的开始 | ||||||
| 				// issue#IA8WE0 如果注释符出现在包装符内,被认为是普通字符 | 				// issue#IA8WE0 如果注释符出现在包装符内,被认为是普通字符 | ||||||
| 				if(!inQuotes && null != this.config.commentCharacter && c == this.config.commentCharacter){ | 				if (!inQuotes && null != this.config.commentCharacter && c == this.config.commentCharacter) { | ||||||
| 					inComment = true; | 					inComment = true; | ||||||
| 				} | 				} | ||||||
| 			} | 			} | ||||||
| 			// 注释行处理 | 			// 注释行处理 | ||||||
| 			if(inComment){ | 			if (inComment) { | ||||||
| 				if (c == CharUtil.CR || c == CharUtil.LF) { | 				if (c == CharUtil.CR || c == CharUtil.LF) { | ||||||
| 					// 注释行以换行符为结尾 | 					// 注释行以换行符为结尾 | ||||||
| 					lineNo++; | 					lineNo++; | ||||||
| @@ -302,8 +302,8 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S | |||||||
| 					buf.mark(); | 					buf.mark(); | ||||||
| 					addField(currentFields, currentField.toString()); | 					addField(currentFields, currentField.toString()); | ||||||
| 					currentField.setLength(0); | 					currentField.setLength(0); | ||||||
| 				} else if (c == config.textDelimiter) { | 				} else if (c == config.textDelimiter && isFieldBegin(preChar)) { | ||||||
| 					// 引号开始 | 					// 引号开始且出现在字段开头 | ||||||
| 					inQuotes = true; | 					inQuotes = true; | ||||||
| 					copyLen++; | 					copyLen++; | ||||||
| 				} else if (c == CharUtil.CR) { | 				} else if (c == CharUtil.CR) { | ||||||
| @@ -361,11 +361,15 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S | |||||||
| 		final char textDelimiter = this.config.textDelimiter; | 		final char textDelimiter = this.config.textDelimiter; | ||||||
|  |  | ||||||
| 		// 忽略多余引号后的换行符 | 		// 忽略多余引号后的换行符 | ||||||
| 		field = StrUtil.trim(field, StrTrimer.TrimMode.SUFFIX, (c-> c == CharUtil.LF || c == CharUtil.CR)); | 		field = StrUtil.trim(field, StrTrimer.TrimMode.SUFFIX, (c -> c == CharUtil.LF || c == CharUtil.CR)); | ||||||
|  |  | ||||||
| 		field = StrUtil.unWrap(field, textDelimiter); | 		if(StrUtil.isWrap(field, textDelimiter)){ | ||||||
| 		field = StrUtil.replace(field, String.valueOf(textDelimiter) + textDelimiter, String.valueOf(textDelimiter)); | 			field = StrUtil.sub(field, 1, field.length() - 1); | ||||||
| 		if(this.config.trimField){ | 			// https://datatracker.ietf.org/doc/html/rfc4180#section-2 | ||||||
|  | 			// 第七条规则,只有包装内的包装符需要转义 | ||||||
|  | 			field = StrUtil.replace(field, String.valueOf(textDelimiter) + textDelimiter, String.valueOf(textDelimiter)); | ||||||
|  | 		} | ||||||
|  | 		if (this.config.trimField) { | ||||||
| 			// issue#I49M0C@Gitee | 			// issue#I49M0C@Gitee | ||||||
| 			field = StrUtil.trim(field); | 			field = StrUtil.trim(field); | ||||||
| 		} | 		} | ||||||
| @@ -384,12 +388,30 @@ public final class CsvParser extends ComputeIter<CsvRow> implements Closeable, S | |||||||
| 		return (c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR; | 		return (c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR; | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * Decides, from the previous character, whether the current position is the start of a field. A field starts when: | ||||||
|  | 	 * <ul> | ||||||
|  | 	 *     <li>there is no previous character, i.e. the text has just begun ({@code preChar == -1})</li> | ||||||
|  | 	 *     <li>the previous character is the field separator, i.e. the preceding field has just ended</li> | ||||||
|  | 	 *     <li>the previous character is a line break (CR or LF), i.e. a new line begins</li> | ||||||
|  | 	 * </ul> | ||||||
|  | 	 * | ||||||
|  | 	 * @param preChar the previous character, or -1 if there is none | ||||||
|  | 	 * @return {@code true} if the current position is the start of a field | ||||||
|  | 	 */ | ||||||
|  | 	private boolean isFieldBegin(final int preChar) { | ||||||
|  | 		return preChar == -1 | ||||||
|  | 			|| preChar == config.fieldSeparator | ||||||
|  | 			|| preChar == CharUtil.LF | ||||||
|  | 			|| preChar == CharUtil.CR; | ||||||
|  | 	} | ||||||
|  |  | ||||||
| 	/** | 	/** | ||||||
| 	 * 内部Buffer | 	 * 内部Buffer | ||||||
| 	 * | 	 * | ||||||
| 	 * @author looly | 	 * @author looly | ||||||
| 	 */ | 	 */ | ||||||
| 	private static class Buffer implements Serializable{ | 	private static class Buffer implements Serializable { | ||||||
| 		private static final long serialVersionUID = 1L; | 		private static final long serialVersionUID = 1L; | ||||||
|  |  | ||||||
| 		final char[] buf; | 		final char[] buf; | ||||||
|   | |||||||
| @@ -8,9 +8,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals; | |||||||
|  |  | ||||||
| /** | /** | ||||||
|  * 按照 https://datatracker.ietf.org/doc/html/rfc4180#section-2<br> |  * 按照 https://datatracker.ietf.org/doc/html/rfc4180#section-2<br> | ||||||
|  * 如果字段正文中出现双引号,需要使用两个双引号表示转义 |  * 如果字段正文中出现双引号,需要使用两个双引号表示转义,并整段使用引号包裹 | ||||||
|  */ |  */ | ||||||
| public class Pr1244Test { | public class Pr1244Test { | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * 此测试中没有引号包裹,则所有引号都被当作内容 | ||||||
|  | 	 */ | ||||||
| 	@Test | 	@Test | ||||||
| 	void csvReadTest() { | 	void csvReadTest() { | ||||||
| 		final String csv = "a,q\"\"e,d,f"; | 		final String csv = "a,q\"\"e,d,f"; | ||||||
| @@ -18,6 +22,21 @@ public class Pr1244Test { | |||||||
| 		final CsvData read = reader.read(); | 		final CsvData read = reader.read(); | ||||||
| 		assertEquals(4, read.getRow(0).size()); | 		assertEquals(4, read.getRow(0).size()); | ||||||
| 		assertEquals("a", read.getRow(0).get(0)); | 		assertEquals("a", read.getRow(0).get(0)); | ||||||
|  | 		assertEquals("q\"\"e", read.getRow(0).get(1)); | ||||||
|  | 		assertEquals("d", read.getRow(0).get(2)); | ||||||
|  | 		assertEquals("f", read.getRow(0).get(3)); | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/** | ||||||
|  | 	 * 此测试中没有引号包裹,则所有引号都被当作内容 | ||||||
|  | 	 */ | ||||||
|  | 	@Test | ||||||
|  | 	void csvReadTest2() { | ||||||
|  | 		final String csv = "a,q\"e,d,f"; | ||||||
|  | 		final CsvReader reader = CsvUtil.getReader(new StringReader(csv)); | ||||||
|  | 		final CsvData read = reader.read(); | ||||||
|  | 		assertEquals(4, read.getRow(0).size()); | ||||||
|  | 		assertEquals("a", read.getRow(0).get(0)); | ||||||
| 		assertEquals("q\"e", read.getRow(0).get(1)); | 		assertEquals("q\"e", read.getRow(0).get(1)); | ||||||
| 		assertEquals("d", read.getRow(0).get(2)); | 		assertEquals("d", read.getRow(0).get(2)); | ||||||
| 		assertEquals("f", read.getRow(0).get(3)); | 		assertEquals("f", read.getRow(0).get(3)); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Looly
					Looly