o
    i7                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dl	mZ d dlmZmZmZmZmZ d dlZd dlmZ d d	lmZ d d
l m Z  d dl!Z!d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 erd dl!m2Z2m3Z3m4Z4 ddl5m6Z6 e( rd dl7Z7e, rd dl8m9Z9 e* oe' oe+ oe) Z:e:rd dl;Z;d dl<m=Z=m>Z> d dl?m@Z@ d dlAmBZBmCZC d dlDmEZE d dlFmGZG d dlHmIZImJZJmKZK d dlLmMZM d dlNmOZOmPZPmQZQmRZR d dlNmMZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe d dlfmgZg d dlhmiZimjZjmkZk G dd  d egd!d"ZlG d#d$ d$eUd!d"ZmG d%d& d&eGd!d"ZnejelZoejemZpejenZqh d'Zrh d(Zsh d)Zte1uevZwd*d+d,d-iZxeyexz Z{d.Z|d/d0 Z}d1d2 Z~d3d4 ZG d5d6 d6ejZd7ed8e$d9e$fd:d;ZG d<d= d=ZG d>d? d?ZG d@dA dAZdBe_evdCkre ZdS dS )D    N)	GeneratorIterable)asynccontextmanager)	lru_cache)BytesIO)Thread)TYPE_CHECKING	AnnotatedOptional	TypedDictUnion)scan_cache_dir)DecodeStream)tqdm)AutoTokenizerBitsAndBytesConfigGenerationConfigPreTrainedTokenizerBase)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )LogitsProcessorListTextIteratorStreamer)logging)PreTrainedModelPreTrainedTokenizerFastProcessorMixin)ContinuousBatchingManager)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionChatCompletionMessageChatCompletionMessageParam)Choice)ChatCompletionChunkChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                   @      e Zd ZU dZeed< dS ))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__ rP   rP   X/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/cli/serve.pyrG   s      
 rG   F)totalc                   @   rF   )+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with additional fields for the generation config (as a json string) and passing the request_id
        rH   NrI   rP   rP   rP   rQ   rT   z   rR   rT   c                   @   s.   e Zd ZU dZeed< eed< dZeed< dS )%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerH   FstreamN)	rJ   rK   rL   rM   bytesrO   rN   rW   boolrP   rP   rP   rQ   rU      s
   
 rU   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstopr[   audior\   logprobsmetadata	functions
modalities
predictionrb   rc   rd   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   r]   r^   languagerq   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendzx-request-idc                 C   s   dd l }||  d S Nr   )torchmanual_seed)_seedr~   rP   rP   rQ   set_torch_seed   s   r   c                  C   s$   dd l } | j r| j  d S d S r}   )r~   cudais_availableempty_cache)r~   rP   rP   rQ   reset_torch_cache   s   
r   c                 C   s   dd l }|| S r}   )r~   	ones_like)_input_tensorr~   rP   rP   rQ   torch_ones_like   s   
r   c                   @   s   e Zd ZdZdZdZdZdS )ModalityLLMVLMSTTTTSN)rJ   rK   rL   r   r   r   r   rP   rP   rP   rQ   r      s
    r   reqmodel_generation_configreturnc                 K   sX  |  ddurtdi t| d }nt|}|jdi |}| D ]\}}|dur3t||| q%|  ddurBt	| d |_
|  ddurPt	| d |_
|  ddur^t| d |_|  ddurj| d |_|  ddurv| d |_|  ddurt| d |_t| d d	krd
|_|  ddurt| d |_|  ddurt| d  |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rH   Nmax_output_tokens
max_tokensfrequency_penalty
logit_biasrh   temperatureg        Ftop_pseedrP   )getr   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   r   )r   r   kwargsrH   non_standard_kwargskvrP   rP   rQ   !create_generation_config_from_req   s6   


r   c                   @   s    e Zd ZdZdd Zdd ZdS )	ToolStatez7Lightweight class to keep track of the tool call state.c                 C   s   |    d S N)resetselfrP   rP   rQ   __init__)  s   zToolState.__init__c                 C   s   d| _ d| _d| _d| _dS )z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   rP   rP   rQ   r   ,  s   
zToolState.resetN)rJ   rK   rL   rM   r   r   rP   rP   rP   rQ   r   &  s    r   c                   @   sR   e Zd ZdZ	ddddeded dB fdd	Zd
d Zdd Zdd Z	dd Z
dS )
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nmodelr   timeout_seconds	processor)r    r   c                 C   s>   || _ t|j| _|| _|| _t| j| j| _	| j	
  d S r   )r   rN   name_or_path_name_or_pathr   r   	threadingTimertimeout_reached_timerr{   )r   r   r   r   rP   rP   rQ   r   :  s   zTimedModel.__init__c                 C   s*   | j   t| j| j| _ | j   dS )z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r{   r   rP   rP   rQ   reset_timerG  s   
zTimedModel.reset_timerc                 C   sL   t | dr"| jdur$| `| `d| _d| _t  t  | j  dS dS dS )z>Delete the wrapped model and processor and clean up resources.r   N)hasattrr   r   gccollectr   r   r   r   rP   rP   rQ   delete_modelM  s   zTimedModel.delete_modelc                 C   s4   | j dkr|   t| j d| j  d d S d S )Nr   z was removed from memory after z seconds of inactivity)r   r   loggerinfor   r   rP   rP   rQ   r   \  s   
zTimedModel.timeout_reachedc                 C   s   t | d p
| jdu S )z)Check if the instances have been deleted.r   N)r   r   r   rP   rP   rQ   
is_deletedc  s   zTimedModel.is_deletedr   )rJ   rK   rL   rM   r   r   r   r   r   r   r   rP   rP   rP   rQ   r   4  s    	

r   c                $   @   s  e Zd Z															dodeeejd	d
f deeejdd
f deedB ejdd
f deeejdd
f deedB ejdd
f deedB ejdd
f deeejdd
f deeejdd
f deeejdd
f deeejdd
f deedB ejdd
f deeejd d
f d!eeejd"d
f d#eedB ejd$d
f d%eeejd&d'd(f d)df d*d+Z	d,d- Z
d.d/ Zd0ed1ed2ed3efd4d5Zd0efd6d7Zd0efd8d9Zd0efd:d;Z	<							dpd=ed>edB d?edB d@edB dAedB dBee dB dCedB dDedE d)efdFdGZedHeeB d)efdIdJZeedqdKedB d)eeeef  fdLdMZdNed=ed)e e!B fdOdPZ"edqd?dQd)e#fdRdSZ$edTe#fdUdVZ%dNed)e e!B fdWdXZ&dNed)e'eddf fdYdZZ(dNed)efd[d\Z)dNed)e'eddf fd]d^Z*dNed)efd_d`Z+d)e,dB fdadbZ-dced)efdddeZ.dfefdgdhZ/dfed)e0di fdjdkZ1dfed)e0dl fdmdnZ2dS )rServeFautoN	localhost@  ,  r   continuous_batchingz8Whether to use continuous batching for chat completions.)helpdevicezgDevice to use for inference; will default to `auto` and place the model on an accelerator if available.dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.trust_remote_codez2Whether to trust remote code when loading a model.attn_implementationzWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.quantizationzAWhich quantization method to use. choices: 'bnb-4bit', 'bnb-8bit'hostz$Interface the server will listen to.portzPort the server will listen to.model_timeoutz@Time in seconds after which a model will be removed from memory.	log_levelz8Logging level as a string. Example: 'info' or 'warning'.default_seedz1The default seed for torch, should be an integer.enable_corsztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.input_validationz+Whether to turn on strict input validation.force_modelzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.non_blockingTz/Whether to run the server in a separate thread.)hiddenr   r   c                    sJ  t std| _| _| _| _| _| _| _| _	|	 _
|
 _| _| _| _| _| _|d ur;t| td}|tj|
   td}|tj|
   i  _d  _d  _d  _d  _ j
d u ru jrrdnd _
 jr  j}| _ | tdtf fdd}t|d	} jr|j t!d
gdd
gd
gd t"#d ddl$m%} |&dd|dt'f fdd}|&ddt'f fdd}|&dd|f fdd}|(d|)d fdd}|)ddd  }|*d!d|fd"d#}t+j,| j j	 jd$}t+-| _. jr /  d S  j.0  d S )%NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`transformersz+transformers.generation.continuous_batchingr   appc                   sB   d V   j  D ]}|  q	 jd ur jjddd d S d S )NT   blocktimeout)loaded_modelsvaluesr   #running_continuous_batching_managerrh   )r   r   r   rP   rQ   lifespan  s   

z Serve.__init__.<locals>.lifespan)r   *T)allow_originsallow_credentialsallow_methodsallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.r   )Requestz/v1/chat/completionsrequestbodyc                    s,    j |d  jr || jjS  |S )Nr   ) validate_chat_completion_requestr   #continuous_batching_chat_completionstate
request_idgenerate_chat_completion)r   r   r   rP   rQ   chat_completion  s   
z'Serve.__init__.<locals>.chat_completionz/v1/responsesc                    sD    j | d | dd}|s | }t|S  | }t|ddS )Nr   rW   Ttext/event-stream
media_type)validate_response_requestr   generate_response_non_streamingr&   generate_responser'   )r   rW   response_objoutputr   rP   rQ   	responses  s   

z!Serve.__init__.<locals>.responsesz/v1/audio/transcriptionsc              
      s   |   4 I d H 5}t|d  I d H |d d}td|d j d|d j d|d jd dd	 W d   I d H  n1 I d H sDw   Y   j|d
  	|}t
|ddS )NrV   r   )rV   r   zReceived file: z; MIME type: z; size:    z.2fz KiBr   r   r   )formrU   readr   debugfilenamecontent_typesizevalidate_transcription_requestgenerate_transcriptionr'   )r   r  parsed_requestr  r   rP   rQ   audio_transcriptions  s   (

z,Serve.__init__.<locals>.audio_transcriptionsz
/v1/modelsc                      s   t d  dS )Nlist)objectdata)r&   get_gen_modelsrP   r   rP   rQ   get_all_models  s   z&Serve.__init__.<locals>.get_all_modelsz/healthc                   S   s   t ddiS )Nstatusok)r&   rP   rP   rP   rQ   healthcheck  s   z#Serve.__init__.<locals>.healthcheckhttpc                    s>   | j tptt }|| j_|| I d H }||j t< |S r   )headersr   X_REQUEST_IDrN   uuiduuid4r   r   )r   	call_nextr   responserP   rP   rQ   get_or_set_request_id#  s   
z-Serve.__init__.<locals>.get_or_set_request_id)r   r   r   )1serve_dependencies_availableImportErrorr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
get_loggersetLevel
log_levelslowerr   r   last_messageslast_kv_cache
last_modelprocess_model_nameload_model_and_processorr   r#   add_middlewarer%   r   warning_oncefastapir   postdictoptionsr   
middlewareuvicornConfigServerserverstart_serverrun)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   transformers_logger	cb_loggermodel_id_and_revisionr   r   r   r   r  r  r  r  r  configrP   r   rQ   r   k  s   :





zServe.__init__c                    s,    fdd}t j|ddd _ j  d S )Nc                      s,   t   _t  j  j j  d S r   )asyncionew_event_loop_loopset_event_looprun_until_completer3  serverP   r   rP   rQ   _run4  s   
z Serve.start_server.<locals>._runzuvicorn-threadF)targetnamedaemon)r   r   _threadr{   )r   r@  rP   r   rQ   r4  3  s   zServe.start_serverc                 C   sR   | j std| j  stdd| j_| j r%| j  r'| j jdd d S d S d S )NzHThe server cannot be killed as it was not launched in a separate thread.zThe server is already killed.Tr   )r   )rD  
ValueErroris_aliver3  should_exitjoinr   rP   rP   rQ   kill_server=  s   
zServe.kill_serverr   schema	validatorunused_fieldsc           
   
   C   s   t d|  t| }|j}|| }|r(t d|  tdd| d| jrhz|| W n t	yP } zt d|
   td|
 dd}~ww ||@ }	|	rjt d|	  tdd|	 ddS dS )a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`TypedDict`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   r  setkeys__mutable_keys__errorr$   r   validate_pythonrE   errors)
r   r   rJ  rK  rL  
input_keyspossible_keysunexpected_keyseunused_fields_in_requestrP   rP   rQ   _validate_requestH  s.   

zServe._validate_requestc                 C      | j |tttd d S N)r   rJ  rK  rL  )rZ  rG   response_validatorUNUSED_RESPONSE_FIELDSr   r   rP   rP   rQ   r   y     
zServe.validate_response_requestc                 C   r[  r\  )rZ  rT   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr_  rP   rP   rQ   r     r`  z&Serve.validate_chat_completion_requestc                 C   r[  r\  )rZ  rU   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr_  rP   rP   rQ   r
    r`  z$Serve.validate_transcription_requestr   r   contentr   rolefinish_reason
tool_callsdecode_stream	tokenizerr   c	           
   
   C   s\   |dur|dur|dur| |j|}t|tt |tt|||dd|dgddd}	|	S )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        N)re  rf  rh  r   )deltaindexrg  r   zchat.completion.chunk)idcreatedr   choicessystem_fingerprintr  )step
_tokenizerr.   r   timeChoiceChunkr/   )
r   r   re  r   rf  rg  rh  ri  rj  chunkrP   rP   rQ   build_chat_completion_chunk  s(   "
z!Serve.build_chat_completion_chunkru  c                 C   s   d| j dd dS )a/  
        Builds an event of a streaming OpenAI Response model or a ChatCompletion chunk.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            chunk (`BaseModel` or `ChatCompletionChunk`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        zdata: Texclude_nonez

)model_dump_json)ru  rP   rP   rQ   chunk_to_sse_element  s   zServe.chunk_to_sse_element	cache_dirc              	      s  ddl m}m} g }td tt| jD ]u}|jdkrq|j	}|
 D ]e\}}|j}tdd |D d}	|	s9q%t|	  }
t|
trKd|
v sLq%|
d }|  | t fd	d|D rd
|jv ro|jd
nd}|j|dkr|d| nd }|||d|jd q%q|S )z2
        List LLMs and VLMs in the cache.
        r   !MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMESz/Scanning the cache directory for LLMs and VLMs.r   c                 s   s     | ]}|j d kr|jV  qdS )zconfig.jsonN)	file_name	file_path).0frP   rP   rQ   	<genexpr>  s    z'Serve.get_gen_models.<locals>.<genexpr>Narchitecturesc                 3   s$    | ]}|g  v r|V  qd S r   rP   )r  archllmsvlmsrP   rQ   r    s   " /r   main@)owned_byrm  r  rn  )&transformers.models.auto.modeling_autor}  r~  r   warningr   r   repos	repo_typerefsr   filesnextr   r   openr  
isinstancer-  r   anyrepo_idsplitappendlast_modified)r{  r}  r~  generative_modelsrepor  refrevision_infor  config_pathr9  r  authorrepo_handlerP   r  rQ   r    s>   

zServe.get_gen_modelsr   c           
   	      sh   |d jk}_|r!jdur!jjddd d_\}}t|dr0|jn|t||jj	j
dddd	jdu rV|jd
_t j_j  |j|d dddd|jd d fddfdd fdd} fdd}jj|j|dd}|drt||ddS ||}|jdd}	t|	ddS )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   r   rj  Ffifo)r   eos_token_idpad_token_id	use_cacher   	scheduler)rH   messagespt)return_tensorsadd_generation_promptreturn_dict	input_idsr   c           
   
   3   s   ddl m} z[j| ddV  d}j| D ]F}|d7 }|jr2|jd }j| ||dV  |j|jkr_| jk}t	d	rL|j
k}|oK| }|rPd
nd}j| |dV   W d S qW d S  ty }	 ztt|	 j|  dt|	 dV  W Y d }	~	d S d }	~	ww )Nr   )RequestStatus	assistantrf  r   r      r   )r   re  r   ri  rj  	eos_tokenlengthrh   rg  r   data: {"error": ""})generation.continuous_batchingr  rv  r   request_id_itergenerated_tokensr  FINISHEDr   r   r  	Exceptionr   rR  rN   cancel_request)
r   ri  r  n_tokens_generatedresulttoken_idgenerated_all_tokensfinal_token_is_eosreasonrX  )rH   r8  r   rj  rP   rQ   stream_chat_completion>  sH   




 zIServe.continuous_batching_chat_completion.<locals>.stream_chat_completionc                    sv   d }j  r|d u rj j| dd}j  r|d u s|j}t| tt d tdt	|ddddgd	}|S )
Nr  )r   r   chat.completionr   r  re  rf  rh   rl  messagerg  )rm  rn  r  r   ro  )
r   
is_running
get_resultdecoder  r*   r   rs  r-   r+   )_request_idr  re  chat_completion_result)r8  r   rj  rP   rQ   buffer_chat_completionk  s$   

zIServe.continuous_batching_chat_completion.<locals>.buffer_chat_completionc                   sx   z t   d}| |D ]}|V  tdI d H  qW d S  tjy;   j|  t	d|  d Y d S w )NFr   Request  was cancelled.)
r   tolistrz  r:  sleepCancelledErrorr   r  r   r  )r  ri  _chunk)inputsr   r  rP   rQ   cancellation_wrapper_stream  s   zNServe.continuous_batching_chat_completion.<locals>.cancellation_wrapper_streamc                    s@   z | W S  t jy   j|  td|  d Y d S w )Nr  r  )r:  r  r   r  r   r  )r  )r  r   rP   rQ   cancellation_wrapper_buffer  s   
zNServe.continuous_batching_chat_completion.<locals>.cancellation_wrapper_bufferrW   )r   r   	streamingr   r   rw  application/json)r'  r&  r   rh   r(  r   rj  r   rH   r  r  init_continuous_batchingr   logit_processorr{   apply_chat_templatetor   add_requestr   r   r'   ry  r&   )
r   r   r   must_discard_cacher   r   r  r  ru  
json_chunkrP   )r  rH   r  r8  r   r  rj  rQ   r     sV   






-
z)Serve.continuous_batching_chat_completionr   c                 C   sj   |d urt |trtjS ddlm}m} | jj}||	 v r#tj
}|S ||	 v r.tj}|S td| )Nr   r|  zUnknown modality: )r  r   r   r   r  r}  r~  	__class__rJ   r   r   rE  )r   r   r}  r~  model_classnamemodalityrP   rP   rQ   get_model_modality  s   
zServe.get_model_modalityr  c                 C   s~  g }| D ]}|d g d}|t jkrEt|d tr|d }n"t|d tr@g }|d D ]}|d dkr:||d  q+d|}||d< nr|t jkrt|d tr^|d d|d d nY|d D ]T}|d dkrr|d | qb|d dkrd	|d d
 v rt	dd|d d
 }t
tt|}tjddd}	|	j}
||	j n|d d
 }
|d d|
d qb|| q|S )Nrf  rf  re  re  typerZ    )r  rZ   	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)r  r  )r   r   r  rN   r  r  rH  r   resubr"   r  r   r  	b64decodetempfileNamedTemporaryFilerB  save)r  r  processor_inputsr  parsed_messageparsed_contentre  
image_datar  rV   r  rP   rP   rQ   *get_processor_inputs_from_inbound_messages  s@   




z0Serve.get_processor_inputs_from_inbound_messagesc                    sD  j dur
j |d< |d }|d d dkrdS |d jk}_\}j|d}||}dtD ]}|jjd 	 v rO| nq?|j
|d	|d
dd	d	d}|j}|ddd	}	djjd 	 v rxd}	t||	d	d}
t|jd d}|r|sj }|d jd |krj}i ||
 d	|d fdd}|drttj||
ddS g }d}||
}d}|D ]$}|jd }t|jddr||jj |jr|j}t|ddr|j}qtt t!! dt"dt#d$|dd|d g|d!}|j%d	d"}t&|d#dS )$a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   r  r   rf  r  )r   r   Ttoolsr  )r  r  r  r  tokenizer   req_0gptossFskip_special_tokensskip_promptr   r  )streamerrH   return_dict_in_generatepast_key_valuesc              
   3   s   d}d }dj jd  v rd}d}fdd}t|d}d	}zMz|  t }jd
dV  d	}d}	| D ]}|	d7 }	dj jd  v rQ|d}||7 }|r_||v r^d}q<q<d ur| t	 d krrd|_
q<| t	 d kr|  j|d ddV  q<|j
r| j|7  _|jstd|j}
|
d u rq<|
d}
d|_tt|
ddd|d d}n<|d	krq<d|jvrq<| j|d7  _| j|d8  _|jdk rd	|dd d d }tt|dddd}j|d |gdV  q<|d	krj||dV  q<|	 jk}t| jdr*|| jjk}|o)| }|r/d nd!}j||d"V  |  W n# tyb } ztt| d#t| d$V  W Y d }~nd }~ww W |  d S W |  d S |  w )%NFr  r   T<|channel|>final<|message|>c                         j di | }|j_d S NrP   generater  r%  r   generate_outputr   r   rP   rQ   generate_with_cache?     z[Serve.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cacherA  r   r   r  r  r  
<|return|>r{   r|   rh  )r   rf  rg  r   z\"name\": \"(.*?)\")rB  function
_tool_call)r  rl  r  rm  z"arguments": {{})	arguments)r  rl  r  )r   rf  rh  r   )re  r   r  r  rh   r  r  r  )r9  r  r#  r   r{   r   rv  removesuffixstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr0   r1   r   countrH  r  r   r   rj  r  r  r   rR  rN   )r  r  
filter_cotcot_trace_endr
  threadresults
tool_stater  r  	tool_nametoolr  r  r  rX  rH   generation_kwargsr   r8  r   r   tool_model_familyrP   rQ   r  5  s   






z>Serve.generate_chat_completion.<locals>.stream_chat_completionrW   r   r   rh   re  usager  r   r  r  )rm  rn  r  r   ro  r$  rw  r  )'r   r'  r&  r(  r  r  _MODELS_WITH_TOOL_SUPPORTr9  r  r#  r  r   r  r   r   r   rH   is_continuationr%  get_seq_lengthshaper'   maprz  ro  getattrrk  r  re  rg  r$  r*   r   rs  r-   r+   rH  
model_dumpr&   )r   r   r  r  r   r  r  supported_model_familiesr  r  generation_streamerr%  seq_lenr  re  rg  	generatorr$  ru  choicer  r  rP   r!  rQ   r     s   



 



	zServe.generate_chat_completionc                    s   d jk}_\}td tr6dv r)dd dgng }|dd d nUtd trjdv red d d dkrXdd dgd }n3d }d |d d	< n&d }n!td trdv r}dd dgng }|d  ntd
|j	|ddddd }|
j}ddd}djjd  v rd}t||dd}tjd}d}r|s؈j }	|d jd |	kr؈j}|t|||d|d  fdd}
|
|S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemr  r[   r   rf  re  %inputs should be a list, dict, or strTr  r  r  r  r  rf   r  r  Fr  r  Nr   )r  attention_maskr  rH   r   r  c                 3   s\   d}d }dj jd  v rd}d}fdd}t| d}d}d}d}zz|  t }	td	|td
 |	dddddiidg g dddddd}
|d7 }	|
V  t
d|td
 |	dddddiidg g dddddd}|d7 }	|V  td||td dddg dd}|d7 }	|V  tdd |||td d!g d"d#}|d7 }	|V  d!}| D ]U}dj jd  v r|d$}||7 }|r||v rd}d!}qtd%d ||||g d&}|d7 }	|V  q|r#td%d ||||g d&}|d7 }	|V  qtd'd ||d|g d(}|d7 }	|V  td)d |||td |jg d"d#}|d7 }|d7 }	|V  td*||td dd+d|jgg d,d}|d7 }|d7 }	|V  td-|td
 |	d+ddddii|jgdg ddddd.d}|d7 }	|V  |  W nc ty } zVtd/t|  td0|t|d1}|d7 }	|V  td2|td
 |	d3ddddiig dg dddtd4t|d5d6d}|d7 }	|V  W Y d }~nd }~ww W |  d S W |  d S |  w )7NFr  r   Tr  c                     r  r  r  r  r	  rP   rQ   r
  8  r  zMServe.generate_response.<locals>.stream_response.<locals>.generate_with_cacher  zresponse.createdresp_queuedr2  formatr  rZ   r  ru   r   rk   )rm  
created_atr  r   r2  rZ   r  r  r  ru   rb   rk   )r  sequence_numberr  r  zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r  )rm  r  r  rf  re  )r  r;  output_indexitemzresponse.content_part.addedoutput_textr   r  rZ   annotations)r  item_idr;  r>  content_indexpartr  zresponse.output_text.delta)r  rC  r;  r>  rD  rk  rj   zresponse.output_text.done)r  rC  r;  r>  rD  rZ   rj   zresponse.content_part.donezresponse.output_item.done	completedrm  r  r  rf  re  rB  zresponse.completedrm  r:  r  r   r2  rZ   r  r  r  ru   rb   rk   z"Exception in response generation: rR  )r  r;  r  zresponse.failedfailedserver_error)coder  )rm  r:  r  r   r2  rZ   r  r  r  ru   rb   rk   rR  ) r9  r  r#  r   r{   rs  r7   r3   r   rz  r;   r<   r>   r5   r?   r  r@   rA   r6   rZ   r=   rE  r4   r?  rH  r  r   rR  rN   r9   r:   r8   )r  r  r  r  r
  r  r;  r>  rD  r:  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedr  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedrX  error_eventresponse_failedr"  r   r8  r   r   r   rP   rQ   stream_response.  s  




			


%z0Serve.generate_response.<locals>.stream_response)r'  r&  r(  r  rN   r  r  r-  	TypeErrorr  r  r   r   r9  r  r#  r   r   rH   r&  r%  r'  r(  r   )r   r   r  r   r  r  r-  rH   r%  r.  rX  rP   rW  rQ   r     sb   


	 
ozServe.generate_responsec                 C   sP  |  |d }|| jk}|| _| |\}}t|d tr6d|v r)d|d dgng }|d|d d nUt|d trjd|v re|d d d dkrXd|d dg|d }n3|d }|d |d d	< n&|d }n!t|d trd|v r}d|d dgng }||d  ntd
|j	|ddddd }|
|j}|dd}d}d|jjd  v rd}t||jd}	d}
| |r|s| j }|jd |kr| j}
|j|t||	d|
d}|j| _|j|j|dd }t }td| dddtd|g dgg d}td| |d||dd d!d"ii|gd#g |d$dd%|d&d'}|jdd(S ))a  
        Generates an OpenAI Response in non-streaming mode (single JSON payload).

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `dict`: The OpenAI `Response` serialized as a dict.
        r   r1  r2  r3  r  r[   r   rf  re  r4  Tr  r5  r  rf   r  r  Fr  Nr   )r  r6  rH   r   r  r  r=  r  rF  r  r@  rA  rG  r7  r9  r  rZ   r  ru   r   rk   rH  rw  ) r'  r&  r(  r  rN   r  r  r-  rE  r  r  r   r   r9  r  r#  r   rH   r&  r%  r'  r(  r  r   r  batch_decode	sequencesrs  r>   r?   r3   r+  )r   r   r8  r  r   r   r  r   r  rH   r%  r.  r  	full_textr:  response_output_itemrT  rP   rP   rQ   r     s   





z%Serve.generate_response_non_streamingc           
         s   t  std| |d }| |\tjddd}t|jd}jj	}t
|d }tj||dd\}}||dd	j  d
 j d
< ||dd fdd}	|	 S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr  r  rV   )srmonor  )sampling_rater  input_features)r  rH   r   c                  3   sH    j di  } j| jddd }t|d}|jdd V  d S )NTrZ  r   )rZ   rw  rP   )r  r[  r\  r(   ry  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorr"  rP   rQ   _generate_transcription  s
   
z=Serve.generate_transcription.<locals>._generate_transcription)r   r  r'  load_audio_model_and_processorr   rj  r   rH   feature_extractorra  ior   librosaloadr  r   r   )
r   r   r8  r-  rH   model_sampling_rateaudio_bytesaudio_array_rj  rP   rf  rQ   r  x  s2   zServe.generate_transcriptionc                 C   sx   | dp	| d}d}| jdu rd}n#t| jt|kr d}ntt| jD ]}| j| || kr6d} nq'|| _|S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        r  r1  TNF)r   r$  lenrange)r   r   r  req_continues_last_messagesirP   rP   rQ   r&    s   
zServe.is_continuationc                 C   sP   | j dkrtdddd}n| j dkrtdd}nd}|dur&td|  |S )	z
        Returns the quantization config for the given CLI arguments.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        zbnb-4bitTnf4)load_in_4bitbnb_4bit_quant_typebnb_4bit_use_double_quantzbnb-8bit)load_in_8bitNz0Quantization applied with the following config: )r   r   r   r   )r   quantization_configrP   rP   rQ   get_quantization_config  s   

zServe.get_quantization_configmodel_idc                 C   s&   | j dur| j }d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        Nr  z@main)r   )r   r  rP   rP   rQ   r'    s
   

zServe.process_model_namer8  c                 C   sl  ddl }ddlm}m} td|  d|v r!|dd\}}n|d}}z|j||| jd}W n  t	yQ   zt
j||| jd}W n t	yN   t	d	w Y nw | jd
v rZ| jnt|| j}|  }	|| j|| j| j|	d}
|j|fi |
}tt|jd }|j|fi |
}|jjdu o|jjdk}|jjduo|jjdk }|s|rd|j_td|  ||fS )a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        r   N)
AutoConfigAutoProcessorzLoading r  r  r  )revisionr   zBFailed to load processor with `AutoProcessor` and `AutoTokenizer`.)r   N)r  r   r   
device_mapr   r}     r  zLoaded model )r~   r   r  r  r   r   r  from_pretrainedr   OSErrorr   r   r*  r~  r   r   r  rH   r   
max_length)r   r8  r~   r  r  r  r  data_processorr   r}  model_kwargsr9  architecturer   has_default_max_lengthhas_short_max_new_tokensrP   rP   rQ   _load_model_and_data_processor  sX   



	z$Serve._load_model_and_data_processor)r   r   c                 C   r   || j vs| j |  r"| |\}}t|| j|d| j |< ||fS | j |   | j | j}| j | j}||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   r   r   r  r   r   r   r   r   )r   r8  r   r   rP   rP   rQ   r(  4  s   
zServe.load_model_and_processor)r   r    c                 C   r  )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        r  r  )r   r8  rh  ri  rP   rP   rQ   rk  O  s   
z$Serve.load_audio_model_and_processor)Fr   r   FNNr   r   r   r   NFFNF)r   NNNNNNNr   )3rJ   rK   rL   r	   rY   typerOptionrN   r   r   r4  rI  r-  r   rD   rO  rZ  r   r   r
  r  r0   r   r
   r.   rv  staticmethodrC   rz  r   r  r  r'   r&   r   r   r  r  r   r   r   r   r  r&  r   r~  r'  r  tupler(  rk  rP   rP   rP   rQ   r   h  sL   
 #&)/069
 I

1

	

:(/ -    5Z0F
r   a  
Run a FastAPI server to serve models on-demand with an OpenAI compatible API.

Models will be loaded and unloaded automatically based on usage and a timeout.


The server will expose the following endpoints:
    - POST /v1/chat/completions: Generates chat completions.
    - POST /v1/responses: Generates responses.
    - POST /v1/audio/transcriptions: Generates transcriptions from audio.
    - GET /v1/models: Lists available models for 3rd party tools.

Requires FastAPI and Uvicorn to be installed.
__main__)r:  r  r   enumr   rm  r   r  r  r   rs  r  collections.abcr   r   
contextlibr   	functoolsr   r   r   typingr   r	   r
   r   r   r  huggingface_hubr   tokenizers.decodersr   r   r   r   r   r   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   utilsr   r   r   r    r  r!   rn  PILr"   r  r0  r+  r#   r$   fastapi.middleware.corsr%   fastapi.responsesr&   r'    openai.types.audio.transcriptionr(   .openai.types.audio.transcription_create_paramsr)   openai.types.chatr*   r+   r,   !openai.types.chat.chat_completionr-   'openai.types.chat.chat_completion_chunkr.   r/   r0   r1   rt  *openai.types.chat.completion_create_paramsr2   openai.types.responsesr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   -openai.types.responses.response_create_paramsrB   pydanticrC   rD   rE   rG   rT   rU   r]  ra  rc  r^  rb  rd  r   rJ   r   r  r  rP  r%  r  r   r   r   Enumr   r-  r   r   r   r   rM   r?  rP   rP   rP   rQ   <module>   s    	D



;4            

