o
    i                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlZd dl	m
Z
 d dlmZ ddlmZ ddlmZmZ d	d
lmZmZmZ d	dlmZ eeZeG dd dZG dd deZG dd deZdS )    N)	dataclassfield)Enum)FileLock)Dataset   )PreTrainedTokenizerBase)check_torch_load_is_safelogging   )!glue_convert_examples_to_featuresglue_output_modesglue_processors)InputFeaturesc                   @   s   e Zd ZU dZeddde  idZe	e
d< eddidZe	e
d< ed	dd
idZee
d< edddidZee
d< dd ZdS )GlueDataTrainingArgumentsz
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
    line.
    helpz"The name of the task to train on: z, )metadata	task_namezUThe input data dir. Should contain the .tsv files (or other data files) for the task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.)defaultr   max_seq_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachec                 C   s   | j  | _ d S N)r   lowerself r   a/sda-disk/www/egybert/egybert_env/lib/python3.10/site-packages/transformers/data/datasets/glue.py__post_init__<   s   z'GlueDataTrainingArguments.__post_init__N)__name__
__module____qualname____doc__r   joinr   keysr   str__annotations__r   r   intr   boolr   r   r   r   r   r   "   s    
 $	r   c                   @   s   e Zd ZdZdZdZdS )SplittraindevtestN)r    r!   r"   r+   r,   r-   r   r   r   r   r*   @   s    r*   c                   @   s|   e Zd ZU eed< eed< ee ed< dej	dfdede
dedB deeB dedB f
d	d
Zdd ZdefddZdd ZdS )GlueDatasetargsoutput_modefeaturesN	tokenizerlimit_lengthmode	cache_dirc                 C   s&  t dt || _t|j  | _t|j | _t	|t
r-zt| }W n ty,   tdw tj|d ur6|n|jd|j d|jj d|j d|j }| j }|jdv ri|jjdv ri|d |d |d< |d< || _|d	 }t| tj|r|jst }	t  tj|d
d| _t d| dt |	  ndt d|j  |tj!kr| j"|j}
n|tj#kr| j$|j}
n| j%|j}
|d ur|
d | }
t&|
||j|| jd| _t }	t'| j| t d| dt |	 dd W d    d S W d    d S 1 sw   Y  d S )Na  This dataset will be removed from the library soon, preprocessing should be handled with the Hugging Face Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.pyzmode is not a valid split namecached__)mnlizmnli-mm)RobertaTokenizerXLMRobertaTokenizerBartTokenizerBartTokenizerFastr      z.lockT)weights_onlyz"Loading features from cached file z [took %.3f s]z'Creating features from dataset file at )
max_length
label_listr0   z!Saving features into cached file z [took z.3fz s])(warningswarnFutureWarningr/   r   r   	processorr   r0   
isinstancer&   r*   KeyErrorospathr$   r   value	__class__r    r   
get_labelsr@   r   existsr   timer	   torchloadr1   loggerinfor,   get_dev_examplesr-   get_test_examplesget_train_examplesr   save)r   r/   r2   r3   r4   r5   cached_features_filer@   	lock_pathstartexamplesr   r   r   __init__K   sj   
$



$zGlueDataset.__init__c                 C   s
   t | jS r   )lenr1   r   r   r   r   __len__      
zGlueDataset.__len__returnc                 C   s
   | j | S r   )r1   )r   ir   r   r   __getitem__   r]   zGlueDataset.__getitem__c                 C   s   | j S r   )r@   r   r   r   r   rK      s   zGlueDataset.get_labels)r    r!   r"   r   r'   r&   listr   r*   r+   r   r(   rZ   r\   r`   rK   r   r   r   r   r.   F   s*   
 
Jr.   )rG   rM   rA   dataclassesr   r   enumr   rN   filelockr   torch.utils.datar   tokenization_utils_baser   utilsr	   r
   processors.gluer   r   r   processors.utilsr   
get_loggerr    rP   r   r*   r.   r   r   r   r   <module>   s"   
