@@ -361,6 +361,43 @@ async def show_version():
     return JSONResponse(content=ver)
 
 
+import json
+import os
+
+save_dict = {}
+flag = os.getenv("VLLM_LOG_OUTPUT", None)
+
+
+async def stream_generator(generator, request, request_id):
+    async for chunk in generator:
+        if request_id not in save_dict:
+            save_dict[request_id] = ""
+        try:
+            data = chunk.strip()
+            if data.startswith('data: '):
+                data = data[len('data: '):]
+            else:
+                # Not an SSE data line; pass it through untouched.
+                yield chunk
+                continue
+            json_data = json.loads(data)
+            if 'choices' in json_data and len(json_data['choices']) > 0:
+                choice = json_data['choices'][0]
+                if 'delta' in choice:
+                    # The first chat delta may carry only a role;
+                    # "content" can be missing or None.
+                    save_dict[request_id] += choice['delta'].get('content') or ''
+                elif 'text' in choice:
+                    save_dict[request_id] += choice['text']
+        except json.JSONDecodeError:
+            # "data: [DONE]" is not JSON: the stream is finished, so log
+            # the accumulated output and drop the entry to bound memory.
+            print(f"Received request_id: {request_id}, request: {request} "
+                  f"content: {save_dict[request_id]}")
+            save_dict.pop(request_id, None)
+        yield chunk
+
+
 @router.post("/v1/chat/completions")
 @with_cancellation
 async def create_chat_completion(request: ChatCompletionRequest,
@@ -370,15 +407,27 @@ async def create_chat_completion(request: ChatCompletionRequest,
         return base(raw_request).create_error_response(
             message="The model does not support Chat Completions API")
 
+    if flag is not None:
+        request_id = "chatcmpl-" \
+            f"{handler._base_request_id(raw_request, request.request_id)}"
+        print(f"First received request_id: {request_id}, request: {request}")
+
     generator = await handler.create_chat_completion(request, raw_request)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
 
     elif isinstance(generator, ChatCompletionResponse):
+        if flag is not None:
+            print(f"Received request-id:{request_id}, request:{request}, "
+                  f"Output:{generator.model_dump()}")
         return JSONResponse(content=generator.model_dump())
 
+    if flag is not None:
+        return StreamingResponse(
+            content=stream_generator(generator, request, request_id),
+            media_type="text/event-stream")
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
@@ -390,13 +439,24 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
         return base(raw_request).create_error_response(
             message="The model does not support Completions API")
 
+    if flag is not None:
+        request_id = f"cmpl-{handler._base_request_id(raw_request)}"
+        print(f"First received request_id: {request_id}, request: {request}")
+
     generator = await handler.create_completion(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.code)
     elif isinstance(generator, CompletionResponse):
+        if flag is not None:
+            print(f"Received request-id:{request_id}, request:{request}, "
+                  f"Output:{generator.model_dump()}")
         return JSONResponse(content=generator.model_dump())
-
+
+    if flag is not None:
+        return StreamingResponse(
+            content=stream_generator(generator, request, request_id),
+            media_type="text/event-stream")
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
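For context, a minimal client-side sketch of exercising this flag, assuming the server was launched with the variable set (e.g. VLLM_LOG_OUTPUT=1 vllm serve <model>) and is listening on localhost:8000; the model name and prompt below are placeholders, not part of the change:

# Client-side sanity check; the logging itself appears on the server
# process's stdout. URL and model name are placeholder assumptions.
import requests

payload = {
    "model": "my-model",  # hypothetical; use whatever the server serves
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
}
with requests.post("http://localhost:8000/v1/chat/completions",
                   json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line:
            print(line)

With the flag set, the server should print the "First received request_id: ..." line when the request arrives and the accumulated completion once the stream reaches [DONE]; with it unset, the original code paths are untouched.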
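stream_generator itself can also be sanity-checked in isolation by feeding it a fake SSE stream; the chunk strings below merely mimic the frame shapes the chat handler emits (a sketch, not a test from this PR):

import asyncio

async def fake_stream():
    # Shapes mimic OpenAI-style SSE frames; the first delta has no "content".
    yield 'data: {"choices": [{"delta": {"role": "assistant"}}]}\n\n'
    yield 'data: {"choices": [{"delta": {"content": "Hi"}}]}\n\n'
    yield 'data: {"choices": [{"delta": {"content": " there"}}]}\n\n'
    yield 'data: [DONE]\n\n'

async def main():
    async for chunk in stream_generator(fake_stream(), request="demo",
                                        request_id="chatcmpl-test"):
        pass  # chunks pass through unchanged; the log line fires at [DONE]

asyncio.run(main())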