UnicodeDecodeError

clehman7 · November 17, 2024, 2:24pm

Brian - I'm playing around with a modified version of the Kitchen Sink ML strategy where I've stripped out the fundamental and quality features and am focusing only on the price, volume, and technical features. I can tell from the logs that the backtest ran, but when I go to create the tear sheet, I receive the following UnicodeDecodeError:

---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/moonchart/perf.py:266, in DailyPerformance.from_moonshot_csv(cls, filepath_or_buffer, start_date, end_date, trim_outliers, how_to_aggregate, riskfree, compound, rolling_sharpe_window)
    265 try:
--> 266     results = read_moonshot_csv(filepath_or_buffer)
    267 except ValueError as e:
    268     # "ValueError: 'Date' is not in list" might mean the user passed
    269     # a paramscan csv by mistake

File /opt/conda/lib/python3.11/site-packages/quantrocket/moonshot.py:238, in read_moonshot_csv(filepath_or_buffer)
    210 """
    211 Load a Moonshot backtest CSV into a DataFrame.
    212 
   (...)
    236 >>> returns = results.loc["Return"]
    237 """
--> 238 return _read_moonshot_or_pnl_csv(filepath_or_buffer)

File /opt/conda/lib/python3.11/site-packages/quantrocket/utils/_parse.py:41, in _read_moonshot_or_pnl_csv(filepath_or_buffer)
     39     raise ImportError("pandas must be installed to use this function")
---> 41 results = pd.read_csv(
     42     filepath_or_buffer,
     43     parse_dates=["Date"],
     44     # columns can have mixed types, silence warning
     45     low_memory=False)
     46 index_cols = ["Field", "Date"]

File /opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    946 kwds.update(kwds_defaults)
--> 948 return _read(filepath_or_buffer, kwds)

File /opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:611, in _read(filepath_or_buffer, kwds)
    610 # Create the parser.
--> 611 parser = TextFileReader(filepath_or_buffer, **kwds)
    613 if chunksize or iterator:

File /opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds)
   1447 self.handles: IOHandles | None = None
-> 1448 self._engine = self._make_engine(f, self.engine)

File /opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1723, in TextFileReader._make_engine(self, f, engine)
   1722 try:
-> 1723     return mapping[engine](f, **self.options)
   1724 except Exception:

File /opt/conda/lib/python3.11/site-packages/pandas/io/parsers/c_parser_wrapper.py:93, in CParserWrapper.__init__(self, src, **kwds)
     92     import_optional_dependency("pyarrow")
---> 93 self._reader = parsers.TextReader(src, **kwds)
     95 self.unnamed_cols = self._reader.unnamed_cols

File parsers.pyx:579, in pandas._libs.parsers.TextReader.__cinit__()

File parsers.pyx:668, in pandas._libs.parsers.TextReader._get_header()

File parsers.pyx:879, in pandas._libs.parsers.TextReader._tokenize_rows()

File parsers.pyx:890, in pandas._libs.parsers.TextReader._check_tokenize_status()

File parsers.pyx:2050, in pandas._libs.parsers.raise_parser_error()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb3 in position 15: invalid start byte

During handling of the above exception, another exception occurred:

UnicodeDecodeError                        Traceback (most recent call last)
Cell In[5], line 2
      1 from moonchart import Tearsheet
----> 2 Tearsheet.from_moonshot_csv("quantitativo_ml_results.csv")

File /opt/conda/lib/python3.11/site-packages/moonchart/tearsheet.py:129, in Tearsheet.from_moonshot_csv(cls, filepath_or_buffer, figsize, max_cols_for_details, trim_outliers, how_to_aggregate, pdf_filename, riskfree, start_date, end_date, compound, rolling_sharpe_window)
     54 @classmethod
     55 def from_moonshot_csv(
     56     cls,
   (...)
     67     rolling_sharpe_window: int = 200
     68     ) -> None:
     69     """
     70     Create a full tear sheet from a moonshot backtest results CSV.
     71 
   (...)
    127     >>> Tearsheet.from_moonshot_csv("backtest_results.csv")
    128     """
--> 129     perf = DailyPerformance.from_moonshot_csv(
    130         filepath_or_buffer,
    131         start_date=start_date,
    132         end_date=end_date,
    133         trim_outliers=trim_outliers,
    134         how_to_aggregate=how_to_aggregate,
    135         riskfree=riskfree,
    136         compound=compound,
    137         rolling_sharpe_window=rolling_sharpe_window)
    139     t = cls(figsize=figsize,
    140             max_cols_for_details=max_cols_for_details,
    141             pdf_filename=pdf_filename)
    143     return t.create_full_tearsheet(perf)

File /opt/conda/lib/python3.11/site-packages/moonchart/perf.py:272, in DailyPerformance.from_moonshot_csv(cls, filepath_or_buffer, start_date, end_date, trim_outliers, how_to_aggregate, riskfree, compound, rolling_sharpe_window)
    270 if "Date" not in repr(e):
    271     raise
--> 272 results = pd.read_csv(filepath_or_buffer)
    273 if "StrategyOrDate" in results.columns:
    274     raise MoonchartError("this looks like a parameter scan CSV, please use ParamscanTearsheet.from_csv")

File /opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    944     dtype_backend=dtype_backend,
    945 )
    946 kwds.update(kwds_defaults)
--> 948 return _read(filepath_or_buffer, kwds)

File /opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:611, in _read(filepath_or_buffer, kwds)
    608 _validate_names(kwds.get("names", None))
    610 # Create the parser.
--> 611 parser = TextFileReader(filepath_or_buffer, **kwds)
    613 if chunksize or iterator:
    614     return parser

File /opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds)
   1445     self.options["has_index_names"] = kwds["has_index_names"]
   1447 self.handles: IOHandles | None = None
-> 1448 self._engine = self._make_engine(f, self.engine)

File /opt/conda/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1723, in TextFileReader._make_engine(self, f, engine)
   1720     raise ValueError(msg)
   1722 try:
-> 1723     return mapping[engine](f, **self.options)
   1724 except Exception:
   1725     if self.handles is not None:

File /opt/conda/lib/python3.11/site-packages/pandas/io/parsers/c_parser_wrapper.py:93, in CParserWrapper.__init__(self, src, **kwds)
     90 if kwds["dtype_backend"] == "pyarrow":
     91     # Fail here loudly instead of in cython after reading
     92     import_optional_dependency("pyarrow")
---> 93 self._reader = parsers.TextReader(src, **kwds)
     95 self.unnamed_cols = self._reader.unnamed_cols
     97 # error: Cannot determine type of 'names'

File parsers.pyx:579, in pandas._libs.parsers.TextReader.__cinit__()

File parsers.pyx:668, in pandas._libs.parsers.TextReader._get_header()

File parsers.pyx:879, in pandas._libs.parsers.TextReader._tokenize_rows()

File parsers.pyx:890, in pandas._libs.parsers.TextReader._check_tokenize_status()

File parsers.pyx:2050, in pandas._libs.parsers.raise_parser_error()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb3 in position 15: invalid start byte

Any suggestions would be appreciated.

Thanks.

Brian · November 18, 2024, 1:26pm

This suggests the file is binary or has binary data in it. First, double check that it's the right file. You can also try opening it in Excel to see if there are invalid characters in it. If you're using self.save_to_results() in the strategy, try removing that as maybe something weird is getting added that way.

clehman7 · November 18, 2024, 2:56pm

@Brian I double-checked that it is referencing the right file. I then removed all references to self.save_to_results(), but that did not work. I also tried opening the file in Excel and received a message that the file is corrupt and could not be opened. Any other ideas? Thanks.

Brian · November 19, 2024, 2:07pm

Hard to say without more visibility into all the steps that are taking place.