mirror of
				https://github.com/labmlai/annotated_deep_learning_paper_implementations.git
				synced 2025-10-31 18:58:43 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			51 lines
		
	
	
		
			961 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			51 lines
		
	
	
		
			961 B
		
	
	
	
		
			Python
		
	
	
	
	
	
| from typing import Callable
 | |
| 
 | |
| from labml.configs import BaseConfigs, option
 | |
| 
 | |
| 
 | |
class TokenizerConfigs(BaseConfigs):
    """
    <a id="TokenizerConfigs"></a>

    ## Tokenizer Configurations
    """
    # The tokenizer to use. The string default names the registered option
    # ('character'); labml resolves it to the matching `@option` function.
    tokenizer: Callable = 'character'

    def __init__(self):
        # Declare `tokenizer` as the primary option, so a value assigned
        # directly to this config set selects the tokenizer.
        super().__init__(_primary='tokenizer')
| 
 | |
| 
 | |
@option(TokenizerConfigs.tokenizer)
def basic_english():
    """
    ### Basic english tokenizer

    The experiment uses the character level tokenizer by default.
    You can switch to this one by setting,

    ```
    'tokenizer': 'basic_english'
    ```

    in the configurations dictionary when starting the experiment.
    """
    # Imported lazily so torchtext is only needed when this option is selected.
    from torchtext.data import get_tokenizer
    tokenizer = get_tokenizer('basic_english')
    return tokenizer
 | |
| 
 | |
| 
 | |
def character_tokenizer(x: str):
    """
    ### Character level tokenizer

    Splits the given text into a list of its individual characters.
    """
    return [character for character in x]
 | |
| 
 | |
| 
 | |
@option(TokenizerConfigs.tokenizer)
def character():
    """
    Character level tokenizer configuration
    """
    # This option resolves to the tokenizer function itself.
    return character_tokenizer
 | 
