We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Upon installation,
pip install tika
When attempting:
In [21]: import tika
    ...: tika.initVM()
    ...: from tika import parser
In [22]: parsed = parser.from_file(file_path)
I get
--------------------------------------------------------------------------- timeout Traceback (most recent call last) File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:466, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 462 except BaseException as e: 463 # Remove the TypeError from the exception chain in 464 # Python 3 (including for exceptions like SystemExit). 465 # Otherwise it looks like a bug in the code. --> 466 six.raise_from(e, None) 467 except (SocketTimeout, BaseSSLError, SocketError) as e: File <string>:3, in raise_from(value, from_value) File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:461, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 460 try: --> 461 httplib_response = conn.getresponse() 462 except BaseException as e: 463 # Remove the TypeError from the exception chain in 464 # Python 3 (including for exceptions like SystemExit). 465 # Otherwise it looks like a bug in the code. 
File ~/anaconda3/envs/master/lib/python3.8/http/client.py:1348, in HTTPConnection.getresponse(self) 1347 try: -> 1348 response.begin() 1349 except ConnectionError: File ~/anaconda3/envs/master/lib/python3.8/http/client.py:316, in HTTPResponse.begin(self) 315 while True: --> 316 version, status, reason = self._read_status() 317 if status != CONTINUE: File ~/anaconda3/envs/master/lib/python3.8/http/client.py:277, in HTTPResponse._read_status(self) 276 def _read_status(self): --> 277 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") 278 if len(line) > _MAXLINE: File ~/anaconda3/envs/master/lib/python3.8/socket.py:669, in SocketIO.readinto(self, b) 668 try: --> 669 return self._sock.recv_into(b) 670 except timeout: timeout: timed out During handling of the above exception, another exception occurred: ReadTimeoutError Traceback (most recent call last) File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/adapters.py:486, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies) 485 try: --> 486 resp = conn.urlopen( 487 method=request.method, 488 url=url, 489 body=request.body, 490 headers=request.headers, 491 redirect=False, 492 assert_same_host=False, 493 preload_content=False, 494 decode_content=False, 495 retries=self.max_retries, 496 timeout=timeout, 497 chunked=chunked, 498 ) 500 except (ProtocolError, OSError) as err: File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:798, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 796 e = ProtocolError("Connection aborted.", e) --> 798 retries = retries.increment( 799 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2] 800 ) 801 retries.sleep() File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace) 549 if read is False or 
not self._is_method_retryable(method): --> 550 raise six.reraise(type(error), error, _stacktrace) 551 elif read is not None: File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/packages/six.py:770, in reraise(tp, value, tb) 769 raise value.with_traceback(tb) --> 770 raise value 771 finally: File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:714, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 713 # Make the request on the httplib connection object. --> 714 httplib_response = self._make_request( 715 conn, 716 method, 717 url, 718 timeout=timeout_obj, 719 body=body, 720 headers=headers, 721 chunked=chunked, 722 ) 724 # If we're going to release the connection in ``finally:``, then 725 # the response doesn't need to know about the connection. Otherwise 726 # it will also try to release it and we'll have a double-release 727 # mess. File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:468, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 467 except (SocketTimeout, BaseSSLError, SocketError) as e: --> 468 self._raise_timeout(err=e, url=url, timeout_value=read_timeout) 469 raise File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:357, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value) 356 if isinstance(err, SocketTimeout): --> 357 raise ReadTimeoutError( 358 self, url, "Read timed out. (read timeout=%s)" % timeout_value 359 ) 361 # See the above comment about EAGAIN in Python 3. In Python 2 we have 362 # to specifically catch it and throw the timeout error ReadTimeoutError: HTTPConnectionPool(host='localhost', port=9998): Read timed out. 
(read timeout=60) During handling of the above exception, another exception occurred: ReadTimeout Traceback (most recent call last) Cell In[22], line 1 ----> 1 parsed = parser.from_file(file_path) File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/parser.py:40, in from_file(filename, serverEndpoint, service, xmlContent, headers, config_path, requestOptions, raw_response) 24 ''' 25 Parses a file for metadata and content 26 :param filename: path to file which needs to be parsed or binary file using open(path,'rb') (...) 37 'content' has a str value and metadata has a dict type value. 38 ''' 39 if not xmlContent: ---> 40 output = parse1(service, filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions) 41 else: 42 output = parse1(service, filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'}, 43 headers=headers, config_path=config_path, requestOptions=requestOptions) File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/tika.py:337, in parse1(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path, requestOptions) 335 headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)}) 336 with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f: --> 337 status, response = callServer('put', serverEndpoint, service, f, 338 headers, verbose, tikaServerJar, config_path=config_path, 339 rawResponse=rawResponse, requestOptions=requestOptions) 341 if file_type == 'remote': os.unlink(path) 342 return (status, response) File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/tika.py:555, in callServer(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path, requestOptions) 552 effectiveRequestOptions = requestOptionsDefault.copy() 553 
effectiveRequestOptions.update(requestOptions) --> 555 resp = verbFn(serviceUrl, encodedData, **effectiveRequestOptions) 557 if verbose: 558 print(sys.stderr, "Request headers: ", headers) File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/api.py:130, in put(url, data, **kwargs) 118 def put(url, data=None, **kwargs): 119 r"""Sends a PUT request. 120 121 :param url: URL for the new :class:`Request` object. (...) 127 :rtype: requests.Response 128 """ --> 130 return request("put", url, data=data, **kwargs) File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/api.py:59, in request(method, url, **kwargs) 55 # By using the 'with' statement we are sure the session is closed, thus we 56 # avoid leaving sockets open which can trigger a ResourceWarning in some 57 # cases, and look like a memory leak in others. 58 with sessions.Session() as session: ---> 59 return session.request(method=method, url=url, **kwargs) File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 584 send_kwargs = { 585 "timeout": timeout, 586 "allow_redirects": allow_redirects, 587 } 588 send_kwargs.update(settings) --> 589 resp = self.send(prep, **send_kwargs) 591 return resp File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs) 700 start = preferred_clock() 702 # Send the request --> 703 r = adapter.send(request, **kwargs) 705 # Total elapsed time of the request (approximately) 706 elapsed = preferred_clock() - start File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/adapters.py:532, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies) 530 raise SSLError(e, request=request) 531 elif isinstance(e, ReadTimeoutError): --> 532 raise ReadTimeout(e, request=request) 533 elif isinstance(e, 
_InvalidHeader): 534 raise InvalidHeader(e, request=request) ReadTimeout: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60) In [23]:
How can I overcome it?
The text was updated successfully, but these errors were encountered:
Never mind, I had missed setting the following environment variable:
TIKA_SERVER_JAR="file:////tika-server-standard.jar"
After setting this environment variable, it worked.
Sorry, something went wrong.
No branches or pull requests
Upon installation,
When attempting:
I get
How can I overcome it?
The text was updated successfully, but these errors were encountered: