# mirror of https://github.com/labmlai/annotated_deep_learning_paper_implementations.git
# synced 2025-11-04 06:16:05 +08:00
from typing import Callable

from labml.configs import BaseConfigs, option

class TokenizerConfigs(BaseConfigs):
    """
    <a id="TokenizerConfigs"></a>

    ## Tokenizer Configurations
    """

    # Tokenizer callable; the string default names an option registered
    # below with `@option(TokenizerConfigs.tokenizer)`, which labml
    # resolves to the actual callable at configuration time.
    tokenizer: Callable = 'character'

    def __init__(self):
        # `_primary='tokenizer'` marks `tokenizer` as the primary config
        # of this config group — presumably so a bare string value in the
        # experiment's config dict selects the tokenizer option directly;
        # verify against labml's BaseConfigs documentation.
        super().__init__(_primary='tokenizer')
 | 
						|
 | 
						|
 | 
						|
@option(TokenizerConfigs.tokenizer)
def basic_english():
    """
    ### Basic English tokenizer

    The experiment uses the character level tokenizer by default.
    To switch to this tokenizer, set

    ```
    'tokenizer': 'basic_english'
    ```

    in the configurations dictionary when starting the experiment.
    """
    # Imported lazily so torchtext is only required when this option is used.
    from torchtext.data import get_tokenizer

    tokenizer_fn = get_tokenizer('basic_english')
    return tokenizer_fn
 | 
						|
 | 
						|
 | 
						|
def character_tokenizer(x: str):
    """
    ### Character level tokenizer

    Splits the input text into a list of its individual characters.
    """
    return [ch for ch in x]
 | 
						|
 | 
						|
 | 
						|
@option(TokenizerConfigs.tokenizer)
def character():
    """
    Registers the character level tokenizer as the `'character'` option
    (the default) for `TokenizerConfigs.tokenizer`.
    """
    tokenizer_fn = character_tokenizer
    return tokenizer_fn
 |