Files
2021-10-22 18:34:59 +05:30

51 lines
961 B
Python

from typing import Callable
from labml.configs import BaseConfigs, option
class TokenizerConfigs(BaseConfigs):
"""
<a id="TokenizerConfigs"></a>
## Tokenizer Configurations
"""
tokenizer: Callable = 'character'
def __init__(self):
super().__init__(_primary='tokenizer')
@option(TokenizerConfigs.tokenizer)
def basic_english():
"""
### Basic english tokenizer
We use character level tokenizer in this experiment.
You can switch by setting,
```
'tokenizer': 'basic_english'
```
in the configurations dictionary when starting the experiment.
"""
from torchtext.data import get_tokenizer
return get_tokenizer('basic_english')
def character_tokenizer(x: str):
"""
### Character level tokenizer
"""
return list(x)
@option(TokenizerConfigs.tokenizer)
def character():
"""
Character level tokenizer configuration
"""
return character_tokenizer