Add the ability to encode and decode text to and from tokens
This commit is contained in:
parent 0d79fa2009
commit ecd9c7eeee
tokenizer.py (28 changed lines)
@@ -1,4 +1,4 @@
-from text import tokens
+from text import text, tokens
 
 number_of_tokens = 256 # Number of possible integer values in a byte.
 
@@ -31,7 +31,31 @@ def merge(tokens, number_of_merges = 20):
         merges[most_frequent_pair] = number_of_tokens + i
     return merged_tokens, merges
 
+def encode(text, merges):
+    """Encode the text into a sequence of merged tokens."""
+    tokens = list(text.encode('utf-8'))
+    while len(tokens) > 1:
+        pairs = get_pairs(tokens)
+        pair = min(pairs, key = lambda pair: merges.get(pair, float('inf')))
+        if pair not in merges:
+            break # Nothing else to merge.
+        token = merges[pair]
+        tokens = replace_pair(tokens, pair, token)
+    return tokens
+
+def decode(merged_tokens, merges):
+    """Decode the merged tokens back into a UTF-8 string."""
+    vocabulary = {token: bytes([token]) for token in range(number_of_tokens)}
+    for (token1, token2), new_token in merges.items():
+        vocabulary[new_token] = vocabulary[token1] + vocabulary[token2]
+    tokens = b''.join([vocabulary[token] for token in merged_tokens])
+    return tokens.decode('utf-8', errors = 'replace')
+
 if __name__ == "__main__":
     merged_tokens, merges = merge(tokens)
     print('Merges:', merges)
-    print('Compression Ratio:', len(tokens) / len(merged_tokens))
+    print('Compression Ratio:', len(tokens) / len(merged_tokens))
+
+    encoded_text = encode(text, merges)
+    decoded_text = decode(encoded_text, merges)
+    print('Encoded = Decoded?', text == decoded_text)
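Note: the new encode() calls two helpers, get_pairs and replace_pair, that are defined elsewhere in tokenizer.py and are not part of this diff. A minimal sketch of what they might look like, assuming get_pairs counts adjacent token pairs (iterating the counter yields the pairs themselves, which is all encode() needs) and replace_pair substitutes non-overlapping occurrences of a pair:

from collections import Counter

def get_pairs(tokens):
    # Count occurrences of each adjacent (left, right) token pair.
    return Counter(zip(tokens, tokens[1:]))

def replace_pair(tokens, pair, new_token):
    # Substitute new_token for every non-overlapping occurrence of pair,
    # scanning left to right.
    result, i = [], 0
    while i < len(tokens):
        if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) == pair:
            result.append(new_token)
            i += 2
        else:
            result.append(tokens[i])
            i += 1
    return result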
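Only the signature and two lines of merge() are visible in the hunk header and context above. Under the same assumptions about the helpers, its body plausibly follows the standard byte-pair-encoding training loop; this is a sketch, not the actual implementation in the repository:

def merge(tokens, number_of_merges = 20):
    # Repeatedly merge the most frequent adjacent pair, assigning each
    # new pair the next unused token id (256, 257, ...).
    merged_tokens = list(tokens)
    merges = {}
    for i in range(number_of_merges):
        pairs = get_pairs(merged_tokens)
        most_frequent_pair = max(pairs, key = pairs.get)
        merged_tokens = replace_pair(merged_tokens, most_frequent_pair, number_of_tokens + i)
        merges[most_frequent_pair] = number_of_tokens + i
    return merged_tokens, merges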
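Given the helper sketches above, encode() and decode() round-trip on a hand-built merges table, independently of the text module; a small worked example with made-up values:

merges = {(104, 105): 256}  # the bytes of 'h' and 'i' merge into token 256
assert encode('hi hi', merges) == [256, 32, 256]  # 32 is the space byte
assert decode([256, 32, 256], merges) == 'hi hi'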
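One detail worth noting in decode(): because merges operate on raw bytes, a token sequence that is cut mid-character can yield invalid UTF-8, and errors = 'replace' maps such bytes to U+FFFD instead of raising UnicodeDecodeError. For example, decode([226], {}) returns '\ufffd', since 0xE2 on its own is only the first byte of a three-byte UTF-8 sequence.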