summaryrefslogtreecommitdiff
path: root/counttokens
blob: e1c7442b59afd91f9e67c58de14fc37c748a032e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#! /usr/bin/env nix-shell
#! nix-shell -i python3 -p 'python3.withPackages (ps: with ps; [ transformers argparse ])'

import sys
import argparse
from transformers import AutoTokenizer

def main(argv=None):
    """Print the number of tokens in the text read from standard input.

    All of stdin is read, leading and trailing whitespace is stripped, and
    the remaining text is encoded with the tokenizer that
    ``AutoTokenizer.from_pretrained`` returns for the selected model name.
    ``encode`` is called with its default settings, and the length of its
    result is printed on a single line to stdout.

    Args:
        argv: Command-line arguments to parse, without the program name.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``,
            so a plain ``main()`` call behaves like the script invocation.
            Passing an explicit list allows the CLI to be driven from other
            code or from tests without touching ``sys.argv``.

    Raises:
        SystemExit: Raised by argparse for ``--help`` (status 0) or for
            invalid arguments (status 2).
    """
    parser = argparse.ArgumentParser(
        description="Tokenize input text using specified model tokenizer."
    )
    parser.add_argument(
        '-m', '--model',
        type=str,
        default='gpt2',
        help='Model name for tokenizer, e.g., "gpt2", "distilgpt2". Default is "gpt2".'
    )
    # parse_args(None) falls back to sys.argv[1:], preserving the old behavior.
    args = parser.parse_args(argv)

    # Read the whole input up front; surrounding whitespace (such as the
    # trailing newline a shell pipeline adds) is not counted.
    text = sys.stdin.read().strip()

    # Load the tokenizer for the requested model and encode the text.
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    tokens = tokenizer.encode(text)

    print(len(tokens))

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()