Mirror of
https://github.com/labmlai/annotated_deep_learning_paper_implementations.git
Synced 2025-08-14 17:41:37 +08:00
51 lines · 961 B · Python
from typing import Callable
|
|
|
|
from labml.configs import BaseConfigs, option
|
|
|
|
|
|
class TokenizerConfigs(BaseConfigs):
    """
    <a id="TokenizerConfigs"></a>

    ## Tokenizer Configurations

    A configurable group of tokenizers; which one is used is selected
    through the primary option ``tokenizer``.
    """

    # The tokenizer callable to use; the string names the registered
    # option, and defaults to the 'character' option defined below.
    tokenizer: Callable = 'character'

    def __init__(self):
        # Tell labml that 'tokenizer' is the primary configuration item
        # of this group.
        super().__init__(_primary='tokenizer')
|
|
|
|
|
|
@option(TokenizerConfigs.tokenizer)
def basic_english():
    """
    ### Basic english tokenizer

    We use character level tokenizer in this experiment.
    You can switch by setting,

    ```
    'tokenizer': 'basic_english'
    ```

    in the configurations dictionary when starting the experiment.
    """
    # Import lazily so torchtext is only required when this option is used.
    from torchtext.data import get_tokenizer

    tokenizer = get_tokenizer('basic_english')
    return tokenizer
|
|
|
|
|
|
def character_tokenizer(x: str):
    """
    ### Character level tokenizer

    Split the input string into a list of its individual characters.
    """
    return [ch for ch in x]
|
|
|
|
|
|
@option(TokenizerConfigs.tokenizer)
def character():
    """
    Character level tokenizer configuration

    Registers `character_tokenizer` as the 'character' option of
    `TokenizerConfigs.tokenizer`.
    """
    return character_tokenizer
|