我正在学习使用Fonduer从文本文档中构建知识库。在按照所附的Jupyter笔记本学习max_storage_temp_tutorial教程时,执行以下代码出现了错误:
corpus_parser = Parser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
%time corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
以下是我得到的错误信息:
UnicodeEncodeError: 'ascii' codec can't encode character '\uf0b7' in position 6282: ordinal not in range(128)
以下是错误的堆栈跟踪:
[INFO] fonduer.utils.udf - Clearing existing...[INFO] fonduer.utils.udf - Running UDF...---------------------------------------------------------------------------UnicodeEncodeError Traceback (most recent call last)<timed eval> in <module>()~/anaconda3/envs/fonduer/lib/python3.6/site-packages/fonduer/utils/udf.py in apply(self, xs, clear, parallelism, progress_bar, count, **kwargs) 48 self.logger.info("Running UDF...") 49 if parallelism is None or parallelism < 2:---> 50 self.apply_st(xs, progress_bar, clear=clear, count=count, **kwargs) 51 else: 52 self.apply_mt(xs, parallelism, clear=clear, **kwargs)~/anaconda3/envs/fonduer/lib/python3.6/site-packages/fonduer/utils/udf.py in apply_st(self, xs, progress_bar, count, **kwargs) 81 82 # Commit session and close progress bar if applicable---> 83 udf.session.commit() 84 if pb: 85 pb.close()~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in commit(self) 941 raise sa_exc.InvalidRequestError("No transaction is begun.") 942 --> 943 self.transaction.commit() 944 945 def prepare(self):~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in commit(self) 465 self._assert_active(prepared_ok=True) 466 if self._state is not PREPARED:--> 467 self._prepare_impl() 468 469 if self._parent is None or self.nested:~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _prepare_impl(self) 445 if self.session._is_clean(): 446 break--> 447 self.session.flush() 448 else: 449 raise exc.FlushError(~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in flush(self, objects) 2252 try: 2253 self._flushing = True-> 2254 self._flush(objects) 2255 finally: 2256 self._flushing = False~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _flush(self, objects) 2378 except: 2379 with util.safe_reraise():-> 2380 transaction.rollback(_capture_exception=True) 2381 2382 def 
bulk_save_objects(~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/langhelpers.py in __exit__(self, type_, value, traceback) 64 self._exc_info = None # remove potential circular references 65 if not self.warn_only:---> 66 compat.reraise(exc_type, exc_value, exc_tb) 67 else: 68 if not compat.py3k and self._exc_info and self._exc_info[1]:~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/compat.py in reraise(tp, value, tb, cause) 247 if value.__traceback__ is not tb: 248 raise value.with_traceback(tb)--> 249 raise value 250 251 else:~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _flush(self, objects) 2342 self._warn_on_events = True 2343 try:-> 2344 flush_context.execute() 2345 finally: 2346 self._warn_on_events = False~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/unitofwork.py in execute(self) 384 while set_: 385 n = set_.pop()--> 386 n.execute_aggregate(self, set_) 387 else: 388 for rec in topological.sort(~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/unitofwork.py in execute_aggregate(self, uow, recs) 666 [self.state] + 667 [r.state for r in our_recs],--> 668 uow) 669 670 def __repr__(self):~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/persistence.py in save_obj(base_mapper, states, uowtransaction, single) 179 _emit_insert_statements(base_mapper, uowtransaction, 180 cached_connections,--> 181 mapper, table, insert) 182 183 _finalize_insert_update_commands(~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/persistence.py in _emit_insert_statements(base_mapper, uowtransaction, cached_connections, mapper, table, insert, bookkeeping) 828 829 c = cached_connections[connection].\--> 830 execute(statement, multiparams) 831 832 if bookkeeping:~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in execute(self, object, *multiparams, **params) 946 raise exc.ObjectNotExecutableError(object) 947 
else:--> 948 return meth(self, multiparams, params) 949 950 def _execute_function(self, func, multiparams, params):~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/sql/elements.py in _execute_on_connection(self, connection, multiparams, params) 267 def _execute_on_connection(self, connection, multiparams, params): 268 if self.supports_execution:--> 269 return connection._execute_clauseelement(self, multiparams, params) 270 else: 271 raise exc.ObjectNotExecutableError(self)~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_clauseelement(self, elem, multiparams, params) 1058 compiled_sql, 1059 distilled_params,-> 1060 compiled_sql, distilled_params 1061 ) 1062 if self._has_events or self.engine._has_events:~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args) 1198 parameters, 1199 cursor,-> 1200 context) 1201 1202 if self._has_events or self.engine._has_events:~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _handle_dbapi_exception(self, e, statement, parameters, cursor, context) 1414 ) 1415 else:-> 1416 util.reraise(*exc_info) 1417 1418 finally:~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/compat.py in reraise(tp, value, tb, cause) 247 if value.__traceback__ is not tb: 248 raise value.with_traceback(tb)--> 249 raise value 250 251 else:~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args) 1168 statement, 1169 parameters,-> 1170 context) 1171 elif not parameters and context.no_parameters: 1172 if self.dialect._has_events:~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/dialects/postgresql/psycopg2.py in do_executemany(self, cursor, statement, parameters, context) 681 extras.execute_batch(cursor, statement, parameters) 682 else:--> 683 
cursor.executemany(statement, parameters) 684 685 @util.memoized_instancemethodUnicodeEncodeError: 'ascii' codec can't encode character '\uf0b7' in position 6282: ordinal not in range(128)
在打印executemany函数的输入时,我发现存在无效的Unicode字符,但我不知道该如何处理。
请注意以下几点:
- 我已经使用教程中的download_data.sh脚本下载了pdf和html文件
- 我已经安装了设置文档中提到的所有先决条件
- Ubuntu 16.04 bash for Windows
- PostgreSQL 版本:[9.5.13]
- Poppler Utils 版本:[0.41.0-0ubuntu1.7]
- Fonduer 版本:[0.2.3]
- 教程可以在这里找到。
- 我使用了Windows上的Ubuntu来运行所需的服务
回答:
问题是由于PostgreSQL数据库的编码引起的。Fonduer需要UTF-8编码,而Windows默认使用不同的编码。
我所需要做的就是:
1. 删除现有的数据库。
dropdb stg_temp_max
2.创建一个使用UTF-8编码的新数据库。
createdb -E UTF8 -T template0 stg_temp_max