summary | refs | log | tree | commit | diff
path: root/counttokens
diff options
context:
space:
mode:
Diffstat (limited to 'counttokens')
-rwxr-xr-x counttokens 30
1 files changed, 30 insertions, 0 deletions
diff --git a/counttokens b/counttokens
new file mode 100755
index 0000000..e1c7442
--- /dev/null
+++ b/counttokens
@@ -0,0 +1,30 @@
+#! /usr/bin/env nix-shell
+#! nix-shell -i python3 -p 'python3.withPackages (ps: with ps; [ transformers argparse ])'
+
+import sys
+import argparse
+from transformers import AutoTokenizer
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Tokenize input text using specified model tokenizer."
+ )
+ parser.add_argument(
+ '-m', '--model',
+ type=str,
+ default='gpt2',
+ help='Model name for tokenizer, e.g., "gpt2", "distilgpt2". Default is "gpt2".'
+ )
+ args = parser.parse_args()
+
+ # Read text from standard input
+ text = sys.stdin.read().strip()
+
+ # Load tokenizer for the specified model
+ tokenizer = AutoTokenizer.from_pretrained(args.model)
+ tokens = tokenizer.encode(text)
+
+ print(len(tokens))
+
+if __name__ == "__main__":  # only run main() when executed as a script, not on import
+ main()