Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added ignoreQuotesInToken support to ignore quotes in strings #46

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 70 additions & 21 deletions src/main/java/org/apache/commons/csv/CSVFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ public CSVFormat getFormat() {
* @see Predefined#Default
*/
public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF,
null, null, null, false, false, false, false, false, false, true);
null, null, null, false, false, false, false, false, false, true, false);

/**
* Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is
Expand Down Expand Up @@ -673,7 +673,7 @@ private static boolean isLineBreak(final Character c) {
*/
public static CSVFormat newFormat(final char delimiter) {
return new CSVFormat(delimiter, null, null, null, null, false, false, null, null, null, null, false, false,
false, false, false, false, true);
false, false, false, false, true, false);
}

/**
Expand Down Expand Up @@ -709,6 +709,8 @@ public static CSVFormat valueOf(final String format) {
private final boolean ignoreHeaderCase; // should ignore header names case

private final boolean ignoreSurroundingSpaces; // Should leading/trailing spaces be ignored around values?

private final boolean ignoreQuotesInToken; // Should quotes inside a token be ignored, keeping text after a closing quote as part of the value?

private final String nullString; // the string to be used for null values

Expand Down Expand Up @@ -762,6 +764,8 @@ public static CSVFormat valueOf(final String format) {
* @param trailingDelimiter
* TODO
* @param autoFlush
* @param ignoreQuotesInToken
* the quotes within a string token will be ignored
* @throws IllegalArgumentException
* if the delimiter is a line break character
*/
Expand All @@ -770,7 +774,8 @@ private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMo
final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
final Object[] headerComments, final String[] header, final boolean skipHeaderRecord,
final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim,
final boolean trailingDelimiter, final boolean autoFlush, final boolean allowDuplicateHeaderNames) {
final boolean trailingDelimiter, final boolean autoFlush, final boolean allowDuplicateHeaderNames,
final boolean ignoreQuotesInToken) {
this.delimiter = delimiter;
this.quoteCharacter = quoteChar;
this.quoteMode = quoteMode;
Expand All @@ -790,6 +795,7 @@ private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMo
this.autoFlush = autoFlush;
this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
this.allowDuplicateHeaderNames = allowDuplicateHeaderNames;
this.ignoreQuotesInToken = ignoreQuotesInToken;
validate();
}

Expand Down Expand Up @@ -864,6 +870,9 @@ public boolean equals(final Object obj) {
if (ignoreSurroundingSpaces != other.ignoreSurroundingSpaces) {
return false;
}
if (ignoreQuotesInToken != other.ignoreQuotesInToken) {
return false;
}
if (ignoreEmptyLines != other.ignoreEmptyLines) {
return false;
}
Expand Down Expand Up @@ -1004,6 +1013,16 @@ public boolean getIgnoreHeaderCase() {
public boolean getIgnoreSurroundingSpaces() {
return ignoreSurroundingSpaces;
}

/**
* Specifies whether quotes in a token are ignored when parsing input.
* <p>
* When enabled, text that follows the closing quote of a quoted value is appended to the value (for example,
* {@code "xyz" 123 bar} is read as {@code xyz 123 bar}). When disabled, non-whitespace text after the closing
* quote makes the parser fail with an {@code IOException}.
* </p>
*
* @return {@code true} to allow quotes anywhere in the string,
* {@code false} to ensure quotes come in the beginning and end of string only.
* @since 1.9
*/
public boolean getIgnoreQuotesInToken() {
return ignoreQuotesInToken;
}

/**
* Gets the String to convert to and from {@code null}.
Expand Down Expand Up @@ -1088,6 +1107,7 @@ public int hashCode() {
result = prime * result + ((escapeCharacter == null) ? 0 : escapeCharacter.hashCode());
result = prime * result + ((nullString == null) ? 0 : nullString.hashCode());
result = prime * result + (ignoreSurroundingSpaces ? 1231 : 1237);
result = prime * result + (ignoreQuotesInToken ? 1231 : 1237);
result = prime * result + (ignoreHeaderCase ? 1231 : 1237);
result = prime * result + (ignoreEmptyLines ? 1231 : 1237);
result = prime * result + (skipHeaderRecord ? 1231 : 1237);
Expand Down Expand Up @@ -1618,6 +1638,9 @@ public String toString() {
if (getIgnoreSurroundingSpaces()) {
sb.append(" SurroundingSpaces:ignored");
}
if (getIgnoreQuotesInToken()) {
sb.append(" QuotesInToken:ignored");
}
if (getIgnoreHeaderCase()) {
sb.append(" IgnoreHeaderCase:ignored");
}
Expand Down Expand Up @@ -1734,7 +1757,7 @@ public CSVFormat withAllowDuplicateHeaderNames(final boolean allowDuplicateHeade
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand All @@ -1760,7 +1783,7 @@ public CSVFormat withAllowMissingColumnNames(final boolean allowMissingColumnNam
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand All @@ -1776,7 +1799,7 @@ public CSVFormat withAutoFlush(final boolean autoFlush) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand Down Expand Up @@ -1812,7 +1835,7 @@ public CSVFormat withCommentMarker(final Character commentMarker) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand All @@ -1831,7 +1854,7 @@ public CSVFormat withDelimiter(final char delimiter) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand Down Expand Up @@ -1863,7 +1886,7 @@ public CSVFormat withEscape(final Character escape) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escape, ignoreSurroundingSpaces,
ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord,
allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand Down Expand Up @@ -2020,7 +2043,7 @@ public CSVFormat withHeader(final String... header) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand All @@ -2042,7 +2065,7 @@ public CSVFormat withHeaderComments(final Object... headerComments) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand All @@ -2068,7 +2091,7 @@ public CSVFormat withIgnoreEmptyLines(final boolean ignoreEmptyLines) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand All @@ -2095,7 +2118,7 @@ public CSVFormat withIgnoreHeaderCase(final boolean ignoreHeaderCase) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand All @@ -2121,9 +2144,35 @@ public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpac
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
* Returns a new {@code CSVFormat} whose parser allows quotes anywhere in the string.
* <p>
* Equivalent to calling {@link #withIgnoreQuotesInToken(boolean)} with {@code true}.
* </p>
*
* @return A new CSVFormat that is equal to this but with quotes allowed anywhere in the string.
* @see #withIgnoreQuotesInToken(boolean)
* @since 1.9
*/
public CSVFormat withIgnoreQuotesInToken() {
return this.withIgnoreQuotesInToken(true);
}

/**
* Returns a new {@code CSVFormat} with the handling of quotes inside a token set to the given value.
*
* @param ignoreQuotesInToken
* {@code true} to allow quotes anywhere in the string,
* {@code false} to ensure quotes come in the beginning and end of string only
* @return A new CSVFormat that is equal to this but with the quotes-in-token behavior set to the given value.
* @since 1.9
*/
public CSVFormat withIgnoreQuotesInToken(final boolean ignoreQuotesInToken) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
* Returns a new {@code CSVFormat} with conversions to and from null for strings on input and output.
* <ul>
Expand All @@ -2141,7 +2190,7 @@ public CSVFormat withNullString(final String nullString) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand Down Expand Up @@ -2173,7 +2222,7 @@ public CSVFormat withQuote(final Character quoteChar) {
return new CSVFormat(delimiter, quoteChar, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces,
ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord,
allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand All @@ -2188,7 +2237,7 @@ public CSVFormat withQuoteMode(final QuoteMode quoteModePolicy) {
return new CSVFormat(delimiter, quoteCharacter, quoteModePolicy, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand Down Expand Up @@ -2227,7 +2276,7 @@ public CSVFormat withRecordSeparator(final String recordSeparator) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand Down Expand Up @@ -2255,7 +2304,7 @@ public CSVFormat withSkipHeaderRecord(final boolean skipHeaderRecord) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand Down Expand Up @@ -2297,7 +2346,7 @@ public CSVFormat withTrailingDelimiter(final boolean trailingDelimiter) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames, ignoreQuotesInToken);
}

/**
Expand Down Expand Up @@ -2325,6 +2374,6 @@ public CSVFormat withTrim(final boolean trim) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
allowDuplicateHeaderNames,ignoreQuotesInToken);
}
}
4 changes: 4 additions & 0 deletions src/main/java/org/apache/commons/csv/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ final class Lexer implements Closeable {

private final boolean ignoreSurroundingSpaces;
private final boolean ignoreEmptyLines;
private final boolean ignoreQuotesInToken;

/** The input stream */
private final ExtendedBufferedReader reader;
Expand All @@ -72,6 +73,7 @@ String getFirstEol(){
this.commentStart = mapNullToDisabled(format.getCommentMarker());
this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
this.ignoreEmptyLines = format.getIgnoreEmptyLines();
this.ignoreQuotesInToken = format.getIgnoreQuotesInToken();
}

/**
Expand Down Expand Up @@ -276,6 +278,8 @@ private Token parseEncapsulatedToken(final Token token) throws IOException {
} else if (readEndOfLine(c)) {
token.type = EORECORD;
return token;
} else if(ignoreQuotesInToken) {
token.content.append((char)c);
} else if (!isWhitespace(c)) {
// error invalid char between token and next delimiter
throw new IOException("(line " + getCurrentLineNumber() +
Expand Down
6 changes: 6 additions & 0 deletions src/test/java/org/apache/commons/csv/CSVFormatTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1051,6 +1051,12 @@ public void testWithIgnoreEmptyLines() throws Exception {
assertFalse(CSVFormat.DEFAULT.withIgnoreEmptyLines(false).getIgnoreEmptyLines());
assertTrue(CSVFormat.DEFAULT.withIgnoreEmptyLines().getIgnoreEmptyLines());
}

@Test
public void testWithIgnoreQuotesInToken() throws Exception {
    // Explicitly disabling the option must be reflected by the getter.
    final CSVFormat strictFormat = CSVFormat.DEFAULT.withIgnoreQuotesInToken(false);
    assertFalse(strictFormat.getIgnoreQuotesInToken());

    // The no-argument variant enables the option.
    final CSVFormat lenientFormat = CSVFormat.DEFAULT.withIgnoreQuotesInToken();
    assertTrue(lenientFormat.getIgnoreQuotesInToken());
}


@Test
Expand Down
27 changes: 27 additions & 0 deletions src/test/java/org/apache/commons/csv/LexerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,15 @@
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.io.StringReader;

import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.internal.runners.statements.Fail;

/**
*
Expand Down Expand Up @@ -68,6 +71,30 @@ public void testSurroundingSpacesAreDeleted() throws IOException {
assertThat(parser.nextToken(new Token()), matches(EOF, ""));
}
}

@Test
public void testIgnoreQuotesInTokenTrue() throws IOException {
    // The second field carries text after its closing quote; with the option enabled this must be accepted.
    final String input = "abc,\"xyz\" 123 bar,3,11961034,\"First author, Second Author\"";
    final CSVFormat lenientFormat = CSVFormat.DEFAULT.withIgnoreQuotesInToken();
    try (final Lexer lexer = createLexer(input, lenientFormat)) {
        assertThat(lexer.nextToken(new Token()), matches(TOKEN, "abc"));
        // Text following the closing quote is appended to the token content.
        assertThat(lexer.nextToken(new Token()), matches(TOKEN, "xyz 123 bar"));
        assertThat(lexer.nextToken(new Token()), matches(TOKEN, "3"));
        assertThat(lexer.nextToken(new Token()), matches(TOKEN, "11961034"));
        // The final field is terminated by end of input, so it is reported with type EOF.
        assertThat(lexer.nextToken(new Token()), matches(EOF, "First author, Second Author"));
    }
}

/**
 * Verifies that, without {@code ignoreQuotesInToken}, text following the closing quote of an encapsulated
 * token is rejected with an {@link IOException}.
 *
 * @throws IOException
 *             if the lexer fails anywhere other than on the malformed second token, which fails the test
 */
@Test
public void testIgnoreQuotesInTokenFalse() throws IOException {
    final String code = "abc,\"xyz\" 123 bar,3,11961034,\"First author, Second Author\"";
    try (final Lexer parser = createLexer(code, CSVFormat.DEFAULT)) {
        // The first token is well-formed; an IOException here is a genuine failure and must propagate.
        assertThat(parser.nextToken(new Token()), matches(TOKEN, "abc"));
        // Only the malformed second token is expected to throw, so the catch is scoped to this call alone.
        try {
            parser.nextToken(new Token());
            fail("Expected an IOException for text following a closing quote");
        } catch (final IOException e) {
            // assertEquals reports the actual message on mismatch, unlike assertTrue(a.equals(b)).
            Assert.assertEquals("(line 1) invalid char between encapsulated token and delimiter", e.getMessage());
        }
    }
}

@Test
public void testSurroundingTabsAreDeleted() throws IOException {
Expand Down