diff --git a/lib/csv.ex b/lib/csv.ex index dadde1b..4cd90d1 100644 --- a/lib/csv.ex +++ b/lib/csv.ex @@ -45,6 +45,8 @@ defmodule CSV do When set to `false` (default), will use no header values. When set to anything but `false`, the resulting rows in the matrix will be maps instead of lists. + * `:raw_line_on_error` – When set to true, raw csv line will be returned on + error tuples. Defaults to false. ## Examples @@ -209,6 +211,13 @@ defmodule CSV do line: index + 1, escape_max_lines: escape_max_lines end + defp yield_or_raise!({:error, EscapeSequenceError, escape_sequence, index, raw_line}, escape_max_lines) do + raise EscapeSequenceError, + escape_sequence: escape_sequence, + line: index + 1, + escape_max_lines: escape_max_lines, + raw_line: raw_line + end defp yield_or_raise!({:error, StrayQuoteError, field, index}, _) do raise StrayQuoteError, @@ -219,6 +228,9 @@ defmodule CSV do defp yield_or_raise!({:error, mod, message, index}, _) do raise mod, message: message, line: index + 1 end + defp yield_or_raise!({:error, mod, message, index, raw_line}, _) do + raise mod, message: message, line: index + 1, raw_line: raw_line + end defp yield_or_raise!({:ok, row}, _), do: row @@ -236,6 +248,14 @@ defmodule CSV do escape_max_lines: escape_max_lines ).message} end + defp yield_or_inline!({:error, EscapeSequenceError, escape_sequence, index, raw_line}, escape_max_lines) do + {:error, + EscapeSequenceError.exception( + escape_sequence: escape_sequence, + line: index + 1, + escape_max_lines: escape_max_lines + ).message, raw_line} + end defp yield_or_inline!({:error, StrayQuoteError, field, index}, _) do {:error, @@ -244,10 +264,20 @@ defmodule CSV do line: index + 1 ).message} end + defp yield_or_inline!({:error, StrayQuoteError, field, index, raw_line}, _) do + {:error, + StrayQuoteError.exception( + field: field, + line: index + 1 + ).message, raw_line} + end defp yield_or_inline!({:error, errormod, message, index}, _) do {:error, errormod.exception(message: message, line: index + 1).message} end + defp yield_or_inline!({:error, errormod, message, index, raw_line}, _) do + {:error, errormod.exception(message: message, line: index + 1).message, raw_line} + end defp yield_or_inline!(value, _), do: value diff --git a/lib/csv/decoding/decoder.ex b/lib/csv/decoding/decoder.ex index 3f6a0b9..1a05f62 100644 --- a/lib/csv/decoding/decoder.ex +++ b/lib/csv/decoding/decoder.ex @@ -39,6 +39,8 @@ defmodule CSV.Decoding.Decoder do be maps instead of lists. * `:replacement` – The replacement string to use where lines have bad encoding. Defaults to `nil`, which disables replacement. + * `:raw_line_on_error` – When set to true, raw csv line will be returned on + error tuples. Defaults to false. ## Examples @@ -131,13 +133,16 @@ defmodule CSV.Decoding.Decoder do defp decode_row({line, index, headers, row_length}, options) do with {:ok, parsed, _} <- parse_row({line, index}, options), - {:ok, _} <- validate_row_length({parsed, index}, row_length), + {:ok, _} <- validate_row_length({parsed, line, index}, row_length, options), do: build_row(parsed, headers) end defp parse_row({line, index}, options) do - with {:ok, lex, _} <- Lexer.lex({line, index}, options), - do: Parser.parse({lex, index}, options) + with {:ok, lex, _} <- Lexer.lex({line, index}, options) do + Parser.parse({lex, line, index}, options) + else + error -> Parser.append_raw_line?(error, line, options) + end end defp build_row(data, headers) when is_list(headers) do @@ -216,17 +221,18 @@ defmodule CSV.Decoding.Decoder do {[{line, index, headers, row_length}], {row_length, options}} end - defp validate_row_length({data, _}, false), do: {:ok, data} - defp validate_row_length({data, _}, nil), do: {:ok, data} + defp validate_row_length({data, _, _}, false, _), do: {:ok, data} + defp validate_row_length({data, _, _}, nil, _), do: {:ok, data} - defp validate_row_length({data, index}, expected_length) do + defp validate_row_length({data, line, index}, expected_length, options) do case data |> Enum.count() do ^expected_length -> {:ok, data} actual_length -> - {:error, RowLengthError, - "Row has length #{actual_length} - expected length #{expected_length}", index} + message = "Row has length #{actual_length} - expected length #{expected_length}" + {:error, RowLengthError, message, index} + |> Parser.append_raw_line?(line, options) end end end diff --git a/lib/csv/decoding/parser.ex b/lib/csv/decoding/parser.ex index febd475..c1d577e 100644 --- a/lib/csv/decoding/parser.ex +++ b/lib/csv/decoding/parser.ex @@ -17,14 +17,21 @@ defmodule CSV.Decoding.Parser do * `:strip_fields` – When set to true, will strip whitespace from fields. Defaults to false. + * `:raw_line_on_error` – When set to true, raw csv line will be returned on + error tuples. Defaults to false. """ def parse(message, options \\ []) - def parse({tokens, index}, options) do + def parse({tokens, index}, options), + do: parse({tokens, "", index}, options) + + def parse({tokens, raw_line, index}, options) do case parse([], "", tokens, :unescaped, options) do {:ok, row} -> {:ok, row, index} - {:error, type, message} -> {:error, type, message, index} + {:error, type, message} -> + {:error, type, message, index} + |> append_raw_line?(raw_line, options) end end @@ -124,4 +131,13 @@ defmodule CSV.Decoding.Parser do _ -> field end end + + @doc false + def append_raw_line?(error_tuple, raw_line, options) do + raw_line_on_error = options |> Keyword.get(:raw_line_on_error, false) + do_append_raw_line?(error_tuple, raw_line, raw_line_on_error) + end + defp do_append_raw_line?(error_tuple, raw_line, true = _raw_line_on_error), + do: Tuple.append(error_tuple, raw_line) + defp do_append_raw_line?(error_tuple, _raw_line, _raw_line_on_error), do: error_tuple end diff --git a/lib/csv/exceptions.ex b/lib/csv/exceptions.ex index 44d3c23..5ea31fa 100644 --- a/lib/csv/exceptions.ex +++ b/lib/csv/exceptions.ex @@ -3,15 +3,17 @@ defmodule CSV.EncodingError do Raised at runtime when the CSV encoding is invalid. """ - defexception [:line, :message] + defexception [:line, :message, :raw_line] def exception(options) do line = options |> Keyword.fetch!(:line) message = options |> Keyword.fetch!(:message) + raw_line = options |> Keyword.get(:raw_line) %__MODULE__{ line: line, - message: message <> " on line " <> Integer.to_string(line) + message: message <> " on line " <> Integer.to_string(line), + raw_line: raw_line } end end @@ -21,15 +23,17 @@ defmodule CSV.RowLengthError do Raised at runtime when the CSV has rows of variable length. """ - defexception [:line, :message] + defexception [:line, :message, :raw_line] def exception(options) do line = options |> Keyword.fetch!(:line) message = options |> Keyword.fetch!(:message) + raw_line = options |> Keyword.get(:raw_line) %__MODULE__{ line: line, - message: message <> " on line " <> Integer.to_string(line) + message: message <> " on line " <> Integer.to_string(line), + raw_line: raw_line } end end @@ -39,18 +43,20 @@ defmodule CSV.StrayQuoteError do Raised at runtime when the CSV row has stray quotes. """ - defexception [:line, :message] + defexception [:line, :message, :raw_line] def exception(options) do line = options |> Keyword.fetch!(:line) field = options |> Keyword.fetch!(:field) + raw_line = options |> Keyword.get(:raw_line) message = "Stray quote on line " <> Integer.to_string(line) <> " near \"" <> field <> "\"" %__MODULE__{ line: line, - message: message + message: message, + raw_line: raw_line } end end @@ -60,12 +66,13 @@ defmodule CSV.EscapeSequenceError do Raised at runtime when the CSV stream ends with unfinished escape sequences """ - defexception [:message] + defexception [:message, :raw_line] def exception(options) do line = options |> Keyword.fetch!(:line) escape_sequence = options |> Keyword.fetch!(:escape_sequence) escape_max_lines = options |> Keyword.fetch!(:escape_max_lines) + raw_line = options |> Keyword.get(:raw_line) message = "Escape sequence started on line #{line} " <> @@ -76,7 +83,8 @@ defmodule CSV.EscapeSequenceError do "it using the escape_max_lines option: https://hexdocs.pm/csv/CSV.html#decode/2" %__MODULE__{ - message: message + message: message, + raw_line: raw_line } end end diff --git a/test/decoding/append_raw_line_test.exs b/test/decoding/append_raw_line_test.exs new file mode 100644 index 0000000..217f0b1 --- /dev/null +++ b/test/decoding/append_raw_line_test.exs @@ -0,0 +1,54 @@ +defmodule DecodingTests.AppendRawLineTest do + use ExUnit.Case + import TestSupport.StreamHelpers + + alias CSV.Decoding.Decoder + alias CSV.RowLengthError + alias CSV.EscapeSequenceError + alias CSV.StrayQuoteError + alias CSV.EncodingError + + defp filter_errors(stream) do + stream + |> Stream.filter(fn + {:error, _, _, _} -> true + {:error, _, _, _, _} -> true + _ -> false + end) + end + + test "produces meaningful errors for non-unicode files returning raw_file" do + stream = "../fixtures/broken-encoding.csv" |> Path.expand(__DIR__) |> File.stream!() + + errors = stream |> Decoder.decode(raw_line_on_error: true) |> filter_errors |> Enum.to_list() + + assert [ + {:error, EncodingError, "Invalid encoding", 0, invalid_string} + ] = errors + refute String.valid?(invalid_string) + end + + test "includes an error for rows with variable length returning raw_file" do + stream = ["a,\"be\"", ",c,d", "e,f", "g,,h", "i,j", "k,l"] |> to_stream + + errors = stream |> Decoder.decode(raw_line_on_error: true) |> filter_errors |> Enum.to_list() + + assert [ + {:error, RowLengthError, "Row has length 3 - expected length 2", 1, line1}, + {:error, RowLengthError, "Row has length 3 - expected length 2", 3, line2} + ] = errors + assert [line1, line2] |> Enum.all?(&String.valid?/1) + end + + test "includes an error for rows with unescaped quotes returning raw_file" do + stream = ["a\",\"be", "\"c,d", "\"e,f\"g\",h"] |> to_stream + errors = stream |> Decoder.decode(raw_line_on_error: true) |> Enum.to_list() + + assert [ + {:error, StrayQuoteError, "a", 0, line1}, + {:error, EscapeSequenceError, "c,d", 1, line2}, + {:error, StrayQuoteError, "e,f", 2, line3} + ] = errors + assert [line1, line2, line3] |> Enum.all?(&String.valid?/1) + end +end