blob: e1c7442b59afd91f9e67c58de14fc37c748a032e (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
|
#! /usr/bin/env nix-shell
#! nix-shell -i python3 -p 'python3.withPackages (ps: with ps; [ transformers argparse ])'
import sys
import argparse
from transformers import AutoTokenizer
def main():
    """Print the number of tokens in the text read from standard input.

    The tokenizer is selected with ``-m/--model`` (default ``gpt2``) and
    loaded via ``AutoTokenizer.from_pretrained``. Leading and trailing
    whitespace is stripped from the input before it is encoded.
    """
    cli = argparse.ArgumentParser(
        description="Tokenize input text using specified model tokenizer."
    )
    cli.add_argument(
        '-m', '--model',
        type=str,
        default='gpt2',
        help='Model name for tokenizer, e.g., "gpt2", "distilgpt2". Default is "gpt2".',
    )
    options = cli.parse_args()

    # Consume all of stdin in one go; surrounding whitespace is dropped
    # before encoding, so it does not contribute to the count.
    stdin_text = sys.stdin.read()
    stripped_text = stdin_text.strip()

    # Resolve the tokenizer for the requested model, then count the ids
    # produced by encoding the input.
    model_tokenizer = AutoTokenizer.from_pretrained(options.model)
    token_ids = model_tokenizer.encode(stripped_text)
    print(len(token_ids))
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|