Commit c8a0462

Add vllm api_server input output log (#12962)
1 parent 3941f32 commit c8a0462

File tree: 1 file changed (+45, -1 lines)

python/llm/src/ipex_llm/vllm/xpu/entrypoints/openai/api_server.py

@@ -361,6 +361,33 @@ async def show_version():
     return JSONResponse(content=ver)
 
 
+save_dict = {}
+import os
+flag = os.getenv("VLLM_LOG_OUTPUT", None)
+async def stream_generator(generator, request, request_id):
+    async for chunk in generator:
+        if request_id not in save_dict:
+            save_dict[request_id] = ""
+        import json
+        try:
+            data = chunk.strip()
+            if data.startswith('data: '):
+                data = data[len('data: '):]
+            else:
+                yield chunk
+            json_data = json.loads(data)
+            if 'choices' in json_data and len(json_data['choices']) > 0:
+                choice = json_data['choices'][0]
+                if 'delta' in choice:
+                    save_dict[request_id] += choice["delta"]["content"]
+                elif 'text' in choice:
+                    save_dict[request_id] += choice["text"]
+        except json.JSONDecodeError:
+            print(f"Received request_id: {request_id}, request: {request} content: {save_dict[request_id]}")
+            pass  # Done
+        yield chunk
+
+
 @router.post("/v1/chat/completions")
 @with_cancellation
 async def create_chat_completion(request: ChatCompletionRequest,
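For reference, the added stream_generator wraps the SSE stream and accumulates the streamed text per request_id so the collected output can be printed (the JSONDecodeError branch fires on the final non-JSON "data: [DONE]" chunk). The following standalone sketch is not part of the commit; it only illustrates the same accumulation idea, assuming each chunk is an SSE string of the form "data: {json}":

import json

def accumulate(chunks):
    """Concatenate the streamed text carried by OpenAI-style SSE chunks."""
    pieces = []
    for chunk in chunks:
        data = chunk.strip()
        if not data.startswith("data: "):
            continue                      # skip keep-alives / comment lines
        payload = data[len("data: "):]
        if payload == "[DONE]":
            break                         # end-of-stream sentinel
        try:
            choice = json.loads(payload)["choices"][0]
        except (json.JSONDecodeError, KeyError, IndexError):
            continue
        # Chat completions stream "delta" objects; plain completions stream "text".
        pieces.append(choice.get("delta", {}).get("content") or choice.get("text") or "")
    return "".join(pieces)

print(accumulate([
    'data: {"choices": [{"delta": {"content": "Hel"}}]}',
    'data: {"choices": [{"delta": {"content": "lo"}}]}',
    'data: [DONE]',
]))  # -> Hello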
@@ -370,15 +397,24 @@ async def create_chat_completion(request: ChatCompletionRequest,
         return base(raw_request).create_error_response(
             message="The model does not support Chat Completions API")
 
+    if flag is not None:
+        request_id = "chatcmpl-" \
+            f"{handler._base_request_id(raw_request, request.request_id)}"
+        print(f"First received request_id: {request_id}, request: {request}")
+
     generator = await handler.create_chat_completion(request, raw_request)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
 
     elif isinstance(generator, ChatCompletionResponse):
+        if flag is not None:
+            print(f"Received request-id:{request_id}, request:{request}, Output:{generator.model_dump()}")
         return JSONResponse(content=generator.model_dump())
 
+    if flag is not None:
+        return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream")
     return StreamingResponse(content=generator, media_type="text/event-stream")
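One hedged way to exercise the streaming branch above: start the server with VLLM_LOG_OUTPUT set in its environment, then send a streaming chat completion request. The base URL and model name below are placeholders for illustration, not values taken from the commit:

import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",   # placeholder server address
    json={
        "model": "my-model",                        # placeholder model name
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
    },
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if line:
        print(line)   # raw SSE lines such as 'data: {...}'

With the flag set, the server should print "First received request_id ..." when the request arrives and, via stream_generator, the content accumulated for that request_id.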

@@ -390,13 +426,21 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
         return base(raw_request).create_error_response(
             message="The model does not support Completions API")
 
+    if flag is not None:
+        request_id = f"cmpl-{handler._base_request_id(raw_request)}"
+        print(f"First received request_id: {request_id}, request: {request}")
+
     generator = await handler.create_completion(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
     elif isinstance(generator, CompletionResponse):
+        if flag is not None:
+            print(f"Received request-id:{request_id}, request:{request}, Output:{generator.model_dump()}")
         return JSONResponse(content=generator.model_dump())
-
+
+    if flag is not None:
+        return StreamingResponse(content=stream_generator(generator, request, request_id), media_type="text/event-stream")
     return StreamingResponse(content=generator, media_type="text/event-stream")
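Because flag is read once at module import time, VLLM_LOG_OUTPUT has to be set in the environment before the server process starts; exporting it afterwards has no effect. A hedged, non-streaming check of the /v1/completions branch (again with a placeholder URL and model name):

import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",   # placeholder server address
    json={"model": "my-model", "prompt": "Hello", "max_tokens": 16},
)
print(resp.json())

With the flag set, this non-streaming path logs the incoming request on arrival and the full CompletionResponse via model_dump().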
