# MeJ Makers 24' Autumn School
# **AI Generative Algorithms - Learning week in Cluj**

## What is ChatGPT really doing when it talks to us -- *Adding One Word at a time*

### Installing PyTorch

In [None]:
## Standard libraries
import os
import math
import numpy as np
import time

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgba
import seaborn as sns
sns.set()

## Progress bar
from tqdm.notebook import tqdm

In [None]:
import torch
print("Using torch", torch.__version__)

## Importing the GPT-2 model and tokenizer from transformers

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [None]:
# Both objects are created from the same pretrained checkpoint id; the cells
# below use them as the globals `tokenizer` and `model`.
checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)

## The most probable next word...

### 1. What are the five most probable words after ...

In [None]:
# Prompt whose continuation we want to inspect
input_text = "The best thing about AI is its ability to"
inputs = tokenizer.encode(input_text, return_tensors="pt")

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    outputs = model(inputs)

# Keep the logits of the last position and turn them into probabilities
predictions = outputs.logits[:, -1, :]
top_proba, top_words = torch.softmax(predictions, dim=-1).topk(5, dim=-1)

# Unpack the single row into plain Python lists (decoded strings / floats)
words = [tokenizer.decode(token_id) for token_id in top_words[0]]
proba = top_proba[0].tolist()

print(f"5 most probable words after '{input_text}' :")
for i, word in enumerate(words):
    print(f"- '{word}': {proba[i]:.4f}")

### 2. Use a bar chart to represent them

In [None]:
# One bar per candidate word, bar height = probability from the previous cell
plt.figure(figsize=(10, 6))
plt.bar(x=words, height=proba)
plt.title(f"Top 5 most probable words after '{input_text}'")
plt.ylabel("Probability")
plt.show()


### 3. Do it 2 or 3 times starting with the text augmented by the previous most probable word

In [None]:
# Same prompt, extended with the word chosen in the previous step
input_text = "The best thing about AI is its ability to learn"
inputs = tokenizer.encode(input_text, return_tensors="pt")

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(inputs)

# Probabilities over the vocabulary, read from the last position
predictions = outputs.logits[:, -1, :]
top_proba, top_words = torch.softmax(predictions, dim=-1).topk(5, dim=-1)

# Decode the five best token ids and fetch their probabilities as floats
words = [tokenizer.decode(token_id) for token_id in top_words[0]]
proba = top_proba[0].tolist()

print(f"5 most probable words after '{input_text}' :")
for i, word in enumerate(words):
    print(f"- '{word}': {proba[i]:.4f}")

In [None]:
# Prompt extended once more with the previously selected word
input_text = "The best thing about AI is its ability to learn from"
inputs = tokenizer.encode(input_text, return_tensors="pt")

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(inputs)

# Last-position logits -> probabilities -> five largest entries
predictions = outputs.logits[:, -1, :]
top_proba, top_words = torch.softmax(predictions, dim=-1).topk(5, dim=-1)

# Plain Python lists for printing and plotting
words = [tokenizer.decode(token_id) for token_id in top_words[0]]
proba = top_proba[0].tolist()

print(f"5 most probable words after '{input_text}' :")
for i, word in enumerate(words):
    print(f"- '{word}': {proba[i]:.4f}")

### 4. Define a function that augments a given text with the next most probable word and test it on an example

In [None]:
def augment_text(input_text):
    """
    Appends the single most probable next token to the input text.

    Uses the notebook-level ``tokenizer`` and ``model`` globals.

    Args:
        input_text (str): The text to extend.

    Returns:
        str: ``input_text`` followed by the decoded highest-scoring token.
    """
    token_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Inference only: read the logits of the last position without gradients
    with torch.no_grad():
        last_logits = model(token_ids).logits[:, -1, :]

    # The largest logit is also the largest probability, so no softmax is needed
    best_token_id = last_logits.argmax(dim=-1)
    return input_text + tokenizer.decode(best_token_id)

In [None]:
# Try the helper on the running example and show the text before and after
input_text = "The best thing about AI is its ability to"
augmented_text = augment_text(input_text)
print(
    f"Original text: '{input_text}'",
    f"Augmented text: '{augmented_text}'",
    sep="\n",
)

### 5. Augment a text sequentially by up to 12 words by repeatedly applying the previous function

In [None]:
# Greedy generation: feed each result back in as the next prompt, 12 times
input_text = "The best thing about AI is its ability to"
augmented_text = input_text
for i, iteration in enumerate(range(1, 13)):
    augmented_text = augment_text(augmented_text)
    print(f"Augmented text after {iteration} iterations: '{augmented_text}'")

### 6. Compute the probabilities at each step and show the corresponding bar charts in a grid

In [None]:
def augment_text_with_most_probable_word(input_text, model, tokenizer):
    """
    Appends the single most probable next token to the input text.

    Args:
        input_text (str): The text to extend.
        model: Callable language model whose output exposes ``.logits``
            (e.g., GPT2LMHeadModel).
        tokenizer: Object providing ``encode`` and ``decode``
            (e.g., GPT2Tokenizer).

    Returns:
        str: ``input_text`` followed by the decoded highest-scoring token.
    """
    encoded = tokenizer.encode(input_text, return_tensors='pt')

    # Inference only: read the logits of the last position without gradients
    with torch.no_grad():
        last_logits = model(encoded).logits[:, -1, :]

    # The largest logit is also the largest probability, so no softmax is needed
    best_token_id = last_logits.argmax(dim=-1)
    return input_text + tokenizer.decode(best_token_id)

def get_top_words_and_probabilities(input_text, model, tokenizer, k=5):
    """
    Returns the k most probable next tokens for a text, with their probabilities.

    Args:
        input_text (str): The prompt to continue.
        model: Callable language model whose output exposes ``.logits``.
        tokenizer: Object providing ``encode`` and ``decode``.
        k (int): How many candidates to return.

    Returns:
        tuple: ``(words, probabilities)`` - two lists of length k, ordered
        from most to least probable; probabilities are Python floats.
    """
    encoded = tokenizer.encode(input_text, return_tensors='pt')

    # Inference only: read the logits of the last position without gradients
    with torch.no_grad():
        last_logits = model(encoded).logits[:, -1, :]

    # Normalise to probabilities, then keep the k largest entries
    best_probs, best_ids = torch.softmax(last_logits, dim=-1).topk(k, dim=-1)

    # Row 0 is the only row for a single prompt; convert it to plain Python values
    words = [tokenizer.decode(token_id) for token_id in best_ids[0]]
    return words, best_probs[0].tolist()

# Augment text sequentially and collect data for plotting
input_text = "The best thing about AI is its ability to"
augmented_text = input_text
num_iterations = 12  # You can adjust the number of iterations
all_words = []
all_probabilities = []
texts_at_each_step = [input_text]

for _ in range(num_iterations):
    words, probabilities = get_top_words_and_probabilities(augmented_text, model, tokenizer)
    all_words.append(words)
    all_probabilities.append(probabilities)
    # torch.topk returns its results sorted in descending order, so words[0]
    # is the most probable continuation. Reusing it avoids a second forward
    # pass through the model for the same prompt.
    augmented_text = augmented_text + words[0]
    texts_at_each_step.append(augmented_text)

# Plotting the bar charts in a grid
# Determine the number of columns for the grid (e.g., 3 columns)
n_cols = 3
n_rows = (num_iterations + n_cols - 1) // n_cols  # Ceiling division: rows needed

# squeeze=False makes `axes` a 2-D array for every grid shape, including a
# single row or a single column, so no special-casing is required.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(n_cols * 5, n_rows * 4), squeeze=False
)
flat_axes = axes.ravel()  # Row-major order: step i lands at (i // n_cols, i % n_cols)

for step, (ax, step_words, step_probabilities) in enumerate(
    zip(flat_axes, all_words, all_probabilities), start=1
):
    ax.bar(step_words, step_probabilities)
    ax.set_ylabel("Probability")
    ax.set_title(f"Step {step}")
    ax.tick_params(axis='x', rotation=45, labelsize=8)  # Rotated, smaller x-axis labels
    ax.tick_params(axis='y', labelsize=8)  # Smaller y-axis labels

# Hide any unused subplots
for unused_ax in flat_axes[num_iterations:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## ...but not always

### 7. Generate sample sentences several times. Change the temperature parameter to see what effect it has.

In [None]:
# Everything that does not change between samples is prepared once, before the loop.
input_text = "The best thing about AI is its ability to"
num_words = 10
temperature = 0.1  # Sampling temperature; change it and re-run to compare outputs
inputs = tokenizer.encode(input_text, return_tensors='pt')
# A single unpadded prompt: every position is a real token. Passing the mask
# explicitly means generate() does not have to infer it while the pad token
# is set to the end-of-sequence token.
attention_mask = torch.ones_like(inputs)
max_length = len(inputs[0]) + num_words  # Prompt length plus num_words new tokens
pad_token_id = tokenizer.eos_token_id

# Draw five independent samples from the same prompt
for i in range(5):
    output_sequence = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=temperature,
        pad_token_id=pad_token_id,
    )
    generated_text_sequence = tokenizer.decode(output_sequence[0], skip_special_tokens=True)
    print(generated_text_sequence)

### 8. Have a look at the 100 most probable words. Try a log-log plot.

In [None]:
input_text = "The best thing about AI is its ability to"
top_k = 100  # Number of candidates to keep; also used in the printed header

inputs = tokenizer.encode(input_text, return_tensors="pt")

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    outputs = model(inputs)

# Keep the logits of the last position and turn them into probabilities
predictions = outputs.logits[:, -1, :]
proba = torch.softmax(predictions, dim=-1)
top_proba, top_words = torch.topk(proba, k=top_k, dim=-1)

# Unpack the single row into plain Python lists (decoded strings / floats)
words = [tokenizer.decode(token_id) for token_id in top_words[0]]
proba = top_proba[0].tolist()

# The header reports the number actually requested (it used to say "5")
print(f"{top_k} most probable words after '{input_text}' :")
for word, probability in zip(words, proba):
    print(f"- '{word}': {probability:.4f}")

In [None]:
# Rank/probability curve on log-log axes, using `words` and `proba` from the previous cell
plt.figure(figsize=(15, 8))  # Large canvas so the word labels stay readable

ranks = range(1, len(proba) + 1)  # 1-based rank of each word
plt.loglog(ranks, proba, marker='o', linestyle='-')
plt.xlabel("Rank of word (log scale)")
plt.ylabel("Probability (log scale)")
plt.title(f"Log-log plot of probabilities for top 100 words after '{input_text}'")
plt.grid(True, which="both", ls="-")

# Annotate ranks 1-10 and then every 10th rank; skip all the others
for rank, word, probability in zip(ranks, words, proba):
    if rank > 10 and rank % 10 != 0:
        continue
    plt.annotate(
        word,
        (rank, probability),
        textcoords="offset points",
        xytext=(0, 10),
        ha='center',
        fontsize=8,
    )

plt.show()