o
    >e,                     @   sL   d dl Z d dlZd dlmZmZ ddlmZmZ edZ	G dd dZ
dS )    N)OptionalUnion   )LanguageFilterProbingStates%   [a-zA-Z]*[-]+[a-zA-Z]*[^a-zA-Z-]?c                   @   s   e Zd ZdZejfdeddfddZdddZede	e
 fd	d
Zede	e
 fddZdeeef defddZedefddZdefddZedeeef defddZedeeef defddZedeeef defddZdS )CharSetProbergffffff?lang_filterreturnNc                 C   s$   t j| _d| _|| _tt| _d S )NT)	r   	DETECTING_stateactiver   logging	getLogger__name__logger)selfr    r   V/var/www/html/humari/django-venv/lib/python3.10/site-packages/chardet/charsetprober.py__init__,   s   zCharSetProber.__init__c                 C   s   t j| _d S N)r   r
   r   r   r   r   r   reset2   s   zCharSetProber.resetc                 C   s   d S r   r   r   r   r   r   charset_name5      zCharSetProber.charset_namec                 C      t r   NotImplementedErrorr   r   r   r   language9   r   zCharSetProber.languagebyte_strc                 C   r   r   r   )r   r   r   r   r   feed=      zCharSetProber.feedc                 C   s   | j S r   )r   r   r   r   r   state@   s   zCharSetProber.statec                 C   s   dS )Ng        r   r   r   r   r   get_confidenceD   r    zCharSetProber.get_confidencebufc                 C   s   t dd| } | S )Ns   ([ -])+    )resub)r#   r   r   r   filter_high_byte_onlyG   s   z#CharSetProber.filter_high_byte_onlyc                 C   sZ   t  }t| }|D ] }||dd  |dd }| s%|dk r%d}|| q
|S )u7  
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [-ÿ]
        marker: everything else [^a-zA-Z-ÿ]
        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.
        This filter applies to all scripts which do not use English characters.
        N   r$   )	bytearrayINTERNATIONAL_WORDS_PATTERNfindallextendisalpha)r#   filteredwordsword	last_charr   r   r   filter_international_wordsL   s   
z(CharSetProber.filter_international_wordsc                 C   s   t  }d}d}t| d} t| D ])\}}|dkr!|d }d}q|dkr;||kr9|s9|| ||  |d d}q|sG|| |d	  |S )
a[  
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        Fr   c   >r      <r$   TN)r*   
memoryviewcast	enumerater-   )r#   r/   in_tagprevcurrbuf_charr   r   r   remove_xml_tagsn   s"   	
zCharSetProber.remove_xml_tags)r	   N)r   
__module____qualname__SHORTCUT_THRESHOLDr   NONEr   r   propertyr   strr   r   r   bytesr*   r   r   r!   floatr"   staticmethodr'   r3   r>   r   r   r   r   r   (   s$    
! r   )r   r%   typingr   r   enumsr   r   compiler+   r   r   r   r   r   <module>   s   