Author Archives: kang & atul

Custom Agent with Memory in Langchain

Welcome to our tutorial on building a custom QnA agent with memory, using Wikipedia as the information source! In this code, we dive deep into the process of creating an intelligent agent that can remember previous interactions, providing more accurate and contextually relevant answers over time.

Import LLM

from langchain_openai import ChatOpenAI
import os

llm = ChatOpenAI(model="gpt-3.5-turbo", 
                 temperature=0, 
                 openai_api_key=os.environ.get("OPENAI_API_KEY"))

from langchain_openai import ChatOpenAI

import os

llm = ChatOpenAI(model="gpt-3.5-turbo",

temperature=0,

openai_api_key=os.environ.get("OPENAI_API_KEY"))

Import Tool

from langchain.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
wikipedia.run("Highest goals in a single season of La Liga")

from langchain.tools import WikipediaQueryRun

from langchain_community.utilities import WikipediaAPIWrapper

wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())

wikipedia.run("Highest goals in a single season of La Liga")

Bind Tool with LLM

tools = [wikipedia]
llm_with_tools = llm.bind_tools(tools)

1 2	tools = [wikipedia] llm_with_tools = llm.bind_tools(tools)

Create Prompt for LLM

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are very powerful assistant, but bad at calculating lengths of words.",
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages(

[

(

"system",

"You are very powerful assistant, but bad at calculating lengths of words.",

MessagesPlaceholder(variable_name="chat_history"),

("user", "{input}"),

MessagesPlaceholder(variable_name="agent_scratchpad"),

]

)

Create Custom Agent with Memory

from langchain_core.messages import AIMessage, HumanMessage

chat_history = []

from langchain.agents.format_scratchpad.openai_tools import (
    format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser


agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_tool_messages(
            x["intermediate_steps"]
        ),
        "chat_history": lambda x: x["chat_history"],
    }
    | prompt
    | llm_with_tools
    | OpenAIToolsAgentOutputParser()
)

from langchain_core.messages import AIMessage, HumanMessage

chat_history = []

from langchain.agents.format_scratchpad.openai_tools import (

format_to_openai_tool_messages,

)

from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser

agent = (

{

"input": lambda x: x["input"],

"agent_scratchpad": lambda x: format_to_openai_tool_messages(

x["intermediate_steps"]

"chat_history": lambda x: x["chat_history"],

}

| prompt

| llm_with_tools

| OpenAIToolsAgentOutputParser()

)

from langchain.agents import AgentExecutor

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, stream_runnable = False)

from langchain.agents import AgentExecutor

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, stream_runnable = False)

from langchain.callbacks import get_openai_callback
with get_openai_callback() as cb:
    out = agent_executor.invoke({"input": "Highest goals in a single season of La Liga", "chat_history": chat_history})
print(out)
print(cb)

from langchain.callbacks import get_openai_callback

with get_openai_callback() as cb:

out = agent_executor.invoke({"input": "Highest goals in a single season of La Liga", "chat_history": chat_history})

print(out)

print(cb)

chat_history.extend(
    [
        HumanMessage(content="Highest goals in a single season of La Liga"),
        AIMessage(content=out["output"]),
    ]
)

print(chat_history)
agent_executor.invoke({"input": "How many goals he has scored overall in L Liga?", "chat_history": chat_history})

chat_history.extend(

[

HumanMessage(content="Highest goals in a single season of La Liga"),

AIMessage(content=out["output"]),

]

)

print(chat_history)

agent_executor.invoke({"input": "How many goals he has scored overall in L Liga?", "chat_history": chat_history})

Showcase in Gradio

import gradio as gr

agent_history = []
def call_agent(query, chat_history):
    print("Chat history : ", chat_history)
    output = agent_executor.invoke({"input": query, "chat_history": agent_history})

    agent_history.extend(
    [
        HumanMessage(content="Highest goals in a single season of La Liga"),
        AIMessage(content=out["output"]),
    ]
    )


    chat_history += [
        [
            "<b>Question: </b>" + query,
            "<b>Answer: </b>" + output["output"]
        ]
    ]


    return output["output"], chat_history

import gradio as gr

agent_history = []

def call_agent(query, chat_history):

print("Chat history : ", chat_history)

output = agent_executor.invoke({"input": query, "chat_history": agent_history})

agent_history.extend(

[

HumanMessage(content="Highest goals in a single season of La Liga"),

AIMessage(content=out["output"]),

]

)

chat_history += [

[

"<b>Question: </b>" + query,

"<b>Answer: </b>" + output["output"]

]

return output["output"], chat_history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label = "QnA with Wikipedia")
    question = gr.Textbox(label = "Ask you query here")

    with gr.Row():
        submit = gr.Button("Submit")
        clear = gr.ClearButton([chatbot, question])

    def user(user_message, history):

        bot_message, history = call_agent(user_message, history)

        return "", history

    question.submit(user, [question, chatbot], [question, chatbot], queue=False)
    submit.click(user, [question, chatbot], [question, chatbot], queue=False)
    
demo.queue()
demo.launch()

with gr.Blocks() as demo:

chatbot = gr.Chatbot(label = "QnA with Wikipedia")

question = gr.Textbox(label = "Ask you query here")

with gr.Row():

submit = gr.Button("Submit")

clear = gr.ClearButton([chatbot, question])

def user(user_message, history):

bot_message, history = call_agent(user_message, history)

return "", history

question.submit(user, [question, chatbot], [question, chatbot], queue=False)

submit.click(user, [question, chatbot], [question, chatbot], queue=False)

demo.queue()

demo.launch()

So this is how you can create your own custom agent with memory in Langchain. Hope you enjoy reading. If you have any doubt/suggestion please feel free to ask and I will do my best to help or improve myself. Good-bye until next time.

Advanced Tokenization Techniques in NLP

Leave a reply

In Natural Language Processing (NLP), the way we represent text has a profound impact on the performance of our models. Tokenization, the process of breaking down text into smaller manageable units called tokens, is a foundational step in preparing text data for NLP tasks. While simple techniques like word-level tokenization exist, advanced methods like Byte Pair Encoding (BPE), SentencePiece, and WordPiece offer advantages, particularly when dealing with large vocabularies and out-of-vocabulary (OOV) words. Let’s delve into these techniques and understand their nuances.

What is Tokenization?

Tokenization is the process of segmenting a piece of text into smaller units called tokens. These tokens can range from:

Words: “The cat sat on the mat.” -> [“The”, “cat”, “sat”, “on”, “the”, “mat”]
Characters: “NLP is cool!” -> [“N”, “L”, “P”, ” “, “i”, “s”, ” “, “c”, “o”, “o”, “l”, “!”]
Subwords: “understandable” -> [“under”, “##stand”, “##able”]

Why Tokenization Matters

Manageable Units: NLP models generally don’t work directly with raw text. Tokens provide a structured representation for models to process.
Vocabulary Size: Tokenization techniques can influence the vocabulary size of your model, directly impacting memory usage and computational efficiency.

Advanced Tokenization Techniques

Let’s explore some sophisticated tokenization techniques frequently used in modern NLP models:

Byte Pair Encoding: BPE is a data compression technique adapted for NLP. It works as follows:
Initialization: Starts with a vocabulary of individual characters.
Iterative Merging: The most frequent pair of consecutive characters is identified and merged into a new symbol. This process is repeated until a desired vocabulary size is reached.
Example:
Initial vocabulary: ['a', 'b', 'd', 'e', 'g']
Most frequent pair: 'e', 'g' -> merge into 'eg'
New vocabulary: ['a', 'b', 'd', 'eg']
Advantage: BPE effectively handles rare and out-of-vocabulary words by representing them as sequences of subword tokens.

SentencePiece: SentencePiece builds upon BPE but has a key distinction—it treats the input text as a stream of Unicode characters without predefined word boundaries. This makes it language-independent and robust to different writing systems.
Advantage: SentencePiece is particularly useful for languages that don’t have clear-cut word boundaries, such as Chinese or Japanese.

WordPiece: WordPiece is similar to BPE but uses a probabilistic approach to select the best subword merges. It aims to produce subwords that are meaningful from a linguistic perspective.
Advantage: WordPiece often results in more intuitive subword units compared to BPE.

Comparison

Technique	Description	Pros	Cons
BPE	Iteratively merges frequent character pairs	Handles OOV words, language-agnostic	Can produce less intuitive subwords
SentencePiece	BPE-like, operates on raw Unicode text	Handles languages without word boundaries	Can be slightly slower than BPE
WordPiece	Probabilistic version of BPE	More linguistically meaningful subwords	A bit more computationally intensive

Let’s See Them in Action! (Example in Python using a hypothetical tokenizer)

tokenizer = Tokenizer(method='bpe', vocab_size=5000) 

text = "Tokenization is a fascinating process!"
tokens = tokenizer.encode(text)
print(tokens) 
# Output: ['Token', '##ization', ' is', ' a', ' fascin', '##ating', ' pro', '##cess', '!']

tokenizer = Tokenizer(method='bpe', vocab_size=5000)

text = "Tokenization is a fascinating process!"

tokens = tokenizer.encode(text)

print(tokens)

# Output: ['Token', '##ization', ' is', ' a', ' fascin', '##ating', ' pro', '##cess', '!']

Choosing the Right Technique

The best tokenization technique depends on your dataset, language, and the specific NLP task you are tackling. Consider experimenting to find what works optimally for your needs!

If you have any doubt/suggestion please feel free to ask and I will do my best to help or improve myself. Good-bye until next time.

Quantization in Large language models

Leave a reply

Quantization is a process used in machine learning and signal processing to reduce the precision or number of bits used to represent numerical values. The goal is to compress the data or model parameters, leading to reduced storage requirements, faster computation, and lower memory bandwidth. In the context of large language models (LLMs) like GPT, quantization can be applied to both the model weights and activations. In essence, it involves replacing high-precision data used in the model’s weights and activations with lower-precision alternatives. This leads to several benefits:

Smaller model size: Quantization can shrink LLM size by up to 90%, making them easier to store, transfer, and deploy on resource-constrained devices.
Faster inference: Lower-precision operations are faster to perform on hardware, leading to quicker predictions and responses from the LLM.
Lower energy consumption: Smaller models and faster computations translate to reduced energy usage, making LLMs more environmentally friendly.

Here are some common types of quantization techniques used with LLMs:

Weight Quantization:
- This involves reducing the number of bits used to represent the model weights. For example, instead of using 32-bit floating-point numbers, weights can be quantized to 8-bit integers. This reduces the memory footprint and allows for more efficient storage and computation.
Activation Quantization:
- Activation quantization focuses on reducing the precision of the intermediate values (activations) during the forward pass of the neural network. Similar to weight quantization, this can involve representing activations with fewer bits, leading to reduced memory requirements and faster computations.
Fixed-Point Quantization:
- In fixed-point quantization, the range of possible values is divided into fixed intervals. This is in contrast to floating-point representations, where the position of the decimal point can vary. Fixed-point quantization is computationally more efficient but may have limitations in representing a wide range of values with high precision.
Dynamic Quantization:
- Dynamic quantization adapts the precision of the quantized values dynamically during runtime. It allows for better representation of the distribution of values encountered during inference. This technique is useful when the range of values in the model varies widely across different layers.
Vector Quantization:
- Vector quantization involves grouping similar values into clusters and representing them with a single codebook entry. This can be applied to both weights and activations. Vector quantization helps in reducing redundancy and achieving further compression.
Quantization-Aware Training:
- This technique involves training a neural network with the awareness of the subsequent quantization step. The model is trained to be more robust to the loss of precision that occurs during quantization. This can lead to better post-quantization accuracy.
Sparsity and Quantization:
- Combining quantization with sparsity techniques, such as pruning, helps further reduce the memory footprint. Pruning involves removing unnecessary connections or parameters from the model, and when combined with quantization, it can lead to significant compression.

Quantization is a trade-off between model efficiency and loss of precision. While quantization can provide substantial benefits in terms of model size and speed, careful tuning and evaluation are necessary to ensure that the compressed model still performs well on the intended tasks. There are many exciting quantization methods beyond the general techniques I mentioned! Let’s dive into the specifics of those:

GGML (Generalized Gradient Modulation Lottery):

This method combines quantization with lottery ticket hypothesis, suggesting a subset of important connections that can be preserved for accurate model representation.
It focuses on CPU inference and offers flexibility when offloading layers to the GPU for speed boosts.
It’s particularly advantageous for running LLMs on CPUs or Apple M series devices.

GGUF (GPT-Generated Unified Format) – GGUF builds upon the foundation of GGML, but significantly improves upon it in several ways:

Extensibility: GGUF is designed to be more flexible and adaptable, allowing for future updates and additions to the format without breaking compatibility with existing models.
Centralized metadata: All essential information, like special tokens and scaling parameters, are stored in a single file for convenience and clarity.
Hybrid CPU/GPU inference: GGUF models primarily run on CPUs but can offload specific layers to GPUs for performance boosts, offering a good balance between efficiency and speed.
Focus on smaller LLMs: While GGML was originally developed for larger models, GGUF shines with smaller and emerging LLMs like Mistral 7B, making them even more lightweight and accessible.

GPTQ (Generalized Post-Training Quantization):

Aims for 4-bit post-training quantization primarily focused on GPU inference and performance.
It seeks to minimize the mean squared error for each weight during quantization, achieving a good balance between size and accuracy.
During inference, it dynamically dequantizes weights to float16 for further performance improvements.

AWQ (Activation-aware Weight Quantization):

A newer approach similar to GPTQ, but it takes activation values into account when selecting weights for quantization.
This allows skipping less important weights, leading to significant speed-ups compared to GPTQ while maintaining similar or even better performance.
It’s a promising method for achieving efficient and accurate LLMs.

HQQ (Half Quantization Quantization):

HQQ requiring no calibration data, significantly speeds up the quantization of large models, while offering compression quality competitive with that of calibration-based methods.
For instance, HQQ takes less than 5 minutes to process the colossal Llama-2-70B, that’s over 50x faster compared to the widely adopted GPTQ. Our Llama-2-70B quantized to 2-bit outperforms the full-precision Llama-2-13B by a large margin for a comparable memory usage.

These are just a few examples, and the field of LLM quantization is constantly evolving. Ultimately, the best choice of quantization method depends on your specific needs and priorities. Consider factors like target hardware, desired accuracy level, available resources, and performance requirements when making your decision.

If you have any doubts/suggestions please feel free to ask and I will do my best to help or improve myself. Good-bye until next time.

Model Compression vs Model Quantization

Leave a reply

Quantization and compression are two related but distinct concepts when it comes to large language models (LLMs) like GPT-3.5. Let’s explore the differences between quantization and compression in the context of LLMs:

Quantization:
- Definition: Quantization is the process of reducing the precision or bit-width of numerical values in a model.
- Application: In the context of LLMs, quantization typically involves reducing the number of bits used to represent the weights and activations of the model. For example, instead of using 32-bit floating-point numbers, quantization may involve using 16-bit or 8-bit fixed-point numbers.
- Purpose: The primary goal of quantization is to reduce the memory footprint and computational requirements of the model, making it more efficient for deployment on devices with limited resources (such as mobile phones or edge devices).
- Trade-offs: While quantization reduces model size and speeds up inference, it may lead to a slight loss in model accuracy due to the reduced precision of numerical values.
Compression:
- Definition: Compression is the process of reducing the size of the model by removing redundant or unnecessary information.
- Application: Compression techniques can be applied to various parts of the model, such as weights, embeddings, or even intermediate representations. Popular compression techniques include weight pruning (removing small or redundant weights), knowledge distillation (training a smaller model to mimic the behavior of a larger model), and model quantization.
- Purpose: The primary goal of compression is to reduce the storage requirements of the model, making it easier to store, transfer, and deploy.
- Trade-offs: Compression techniques may also lead to a trade-off between model size and accuracy. For example, removing certain weights during pruning might result in a loss of model accuracy, although sophisticated pruning techniques aim to minimize this impact.

In summary, quantization specifically refers to the reduction of numerical precision in the model’s parameters, while compression is a broader concept that encompasses various techniques aimed at reducing the overall size of the model. Both quantization and compression are used to make LLMs more practical for deployment on resource-constrained devices or for efficient storage and transfer.

If you have any doubt/suggestion please feel free to ask and I will do my best to help or improve myself. Good-bye until next time.

Weight Pruning in Neural Networks

Leave a reply

Weight pruning is a technique used to reduce the size of a neural network by removing certain weights, typically those with small magnitudes, without significantly affecting the model’s performance. The idea is to identify and eliminate connections in the network that contribute less to the overall computation. This process helps in reducing the memory footprint and computational requirements during both training and inference.

Initial Model: Let’s consider a simple fully connected neural network with one hidden layer. The architecture might look like this:
Input layer (features) -> Hidden layer -> Output layer (predictions)

Training: The network is trained on a dataset to learn the mapping from inputs to outputs. During training, weights are adjusted through optimization algorithms like gradient descent to minimize the loss function.

Pruning: After training, weight pruning involves identifying and removing certain weights. A common criterion for pruning is to set a threshold, and weights whose absolute values fall below this threshold are pruned. For example, let’s say we have a weight matrix connecting the input layer to the hidden layer:

If we set a pruning threshold of 0.2, weights smaller than 0.2 in absolute value may be pruned. After pruning, the weight matrix might become:

Here, the connections with weights below 0.2 are pruned by setting those weights to zero.

Fine-tuning: Optionally, the pruned model can undergo fine-tuning to recover any loss in accuracy caused by pruning. During fine-tuning, the remaining weights may be adjusted to compensate for the pruned connections.

Weight pruning is an effective method for model compression, reducing the number of parameters in a neural network and making it more efficient for deployment in resource-constrained environments.

If you have any doubt/suggestion please feel free to ask and I will do my best to help or improve myself. Good-bye until next time.

Unleashing the Power of Gelu Activation: Exploring Its Uses, Applications, and Implementation in Python

Leave a reply

Introduction

In the field of deep learning, activation functions play a crucial role in introducing non-linearity to neural networks, enabling them to model complex relationships. One such activation function that has gained popularity is the Gelu activation function. Gelu stands for Gaussian Error Linear Unit, and it offers a smooth and continuous non-linear transformation. In this blog post, we will dive into the world of Gelu activation, its applications, the formula behind it, and how to implement it in Python.

Understanding Gelu Activation

The Gelu activation function was introduced in 2016 by Dan Hendrycks and Kevin Gimpel as an alternative to other popular activation functions such as ReLU (Rectified Linear Unit). Gelu is known for its ability to capture a wide range of non-linearities while maintaining smoothness and differentiability.

Formula and Characteristics

The Gelu activation function is defined mathematically as follows:

Gelu(x) =

The key characteristics of the Gelu activation function are as follows:

Range: Gelu activation outputs values in the range [0, +inf).
Differentiability: Gelu is a smooth function and possesses derivatives at all points.
Monotonicity: Gelu is a monotonically increasing function, making it suitable for gradient-based optimization algorithms.
Gaussian Approximation: Gelu approximates the cumulative distribution function (CDF) of a standard normal distribution.

Applications of Gelu Activation: Gelu activation has found applications in various domains, including:

Natural Language Processing (NLP): Gelu has shown promising results in NLP tasks such as sentiment analysis, machine translation, and text generation.
Computer Vision: Gelu activation can be used in convolutional neural networks (CNNs) for image classification, object detection, and semantic segmentation tasks.
Recommendation Systems: Gelu activation can enhance the performance of recommendation models by introducing non-linearities and capturing complex user-item interactions.

Implementing Gelu Activation in Python

Let’s see how we can implement the Gelu activation function in Python:

import torch
import torch.nn as nn
import math

class Gelu(nn.Module):
    def __init__(self):
        super(Gelu, self).__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

# Example usage
gelu = Gelu()
input = torch.randn(1, 10)  # Example input tensor
output = gelu(input)
print(output)

import torch

import torch.nn as nn

import math

class Gelu(nn.Module):

def __init__(self):

super(Gelu, self).__init__()

def forward(self, x):

return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

# Example usage

gelu = Gelu()

input = torch.randn(1, 10) # Example input tensor

output = gelu(input)

print(output)

Comparison of Gelu and ReLU Activation Functions

Formula:
- ReLU: ReLU(x) = max(0, x)
- Gelu: Gelu(x) = 0.5x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
Range:
- ReLU: ReLU outputs values in the range [0, +inf).
- Gelu: Gelu also outputs values in the range [0, +inf).
Smoothness and Continuity:
- ReLU: ReLU is a piecewise linear function and non-differentiable at x=0.
- Gelu: Gelu is a smooth and continuous function, ensuring differentiability at all points.
Monotonicity:
- ReLU: ReLU is a piecewise linear function, which means it is monotonically increasing for x > 0.
- Gelu: Gelu is a monotonically increasing function, making it suitable for gradient-based optimization algorithms.
Non-linearity:
- ReLU: ReLU introduces non-linearity by mapping negative values to 0 and preserving positive values unchanged.
- Gelu: Gelu introduces non-linearity through a combination of linear and non-linear transformations, capturing a wider range of non-linearities.
Performance:
- ReLU: ReLU has been widely used due to its simplicity and computational efficiency. However, it suffers from the “dying ReLU” problem where neurons can become inactive (outputting 0) and may not recover during training.
- Gelu: Gelu has shown promising performance in various tasks, including NLP and computer vision, and it addresses the “dying ReLU” problem by maintaining non-zero gradients for all inputs.
Applicability:
- ReLU: ReLU is commonly used in hidden layers of deep neural networks and has been successful in image classification and computer vision tasks.
- Gelu: Gelu has gained popularity in natural language processing (NLP) tasks, such as sentiment analysis and text generation, where capturing complex non-linear relationships is crucial.

Conclusion

Gelu activation offers a powerful tool for introducing non-linearity to neural networks, making them capable of modeling complex relationships. Its smoothness, differentiability, and wide range of applications make it an attractive choice in various domains, including NLP, computer vision, and recommendation systems. By implementing Gelu activation in Python, researchers and practitioners can leverage its potential and explore its benefits in their own deep learning projects. So go ahead, unleash the power of Gelu and take your models to the next level!

If you have any doubt/suggestion please feel free to ask and I will do my best to help or improve myself. Good-bye until next time.

BLEU Score – Bilingual Evaluation Understudy

Leave a reply

Introduction

The BLEU score, which stands for Bilingual Evaluation Understudy, is a metric commonly used to evaluate the quality of machine-generated translations compared to human translations. It measures the similarity between the machine-generated translation and one or more reference translations, assigning a numerical score between 0 and 1. The higher the BLEU score, the closer the machine translation is to the reference translations, indicating better translation quality. BLEU score takes into account factors such as n-gram precision and brevity penalty, providing a useful quantitative measure for comparing different translation systems or assessing improvements in machine translation over time. Don’t worry, we will discuss these terms as we go along with the blog.

Precision

Input Sentence: “Hay un tigre en el bosque”
Human Reference: “There is a tiger in the woods”

Lets assume machine translated output is: “the the the the the”
Accuracy of the machine-generated translation compared to the reference translations can be calculated using precision. Precision basically checks for each word in generated output if it is present in reference sentence or not. So in the given example it will be 5/5. It gives high value even the machine translated output is far away from reference sentence. There comes modified precision. In modified precision we calculate the maximum frequency of word present in the reference sentence. Which will compute to 1/5. This one was for unigram (one word at a time). Similarly it is calculated for n-gram.

Formula

The formula for BLEU score with brevity penalty is as follows:

BLEU = BP * exp(sum(n-gram precision) / N)

Where:

BP (Brevity Penalty) is a penalty term that adjusts the BLEU score based on the brevity of the machine generated translation compared to the reference translations.
n-gram precision is the precision of n-grams (substrings of length n) in the machine generated translation, which is the count of n-gram matches between the machine generated and reference translations divided by the count of n-grams in the machine generated translation.
N is the maximum n-gram order considered in the calculation (typically 4).

The brevity penalty term BP is calculated as:

BP = 1, if c > r
BP = exp(1 – r/c), if c ≤ r

Where:

c is the length (in words) of the machine generated translation.
r is the length (in words) of the closest reference translation.

In this formula, the brevity penalty is applied to adjust the BLEU score based on the difference in length between the candidate and reference translations. If the candidate translation is shorter than the reference, the penalty term encourages longer translations, and if the candidate translation is longer, it discourages excessively long translations.

Implementation

import nltk
nltk.download('punkt')
import math
from collections import Counter

def tokenize(sentence):
    return nltk.word_tokenize(sentence)

def calculate_ngram(candidate, n):
    ngrams = []
    for i in range(len(candidate)-n+1):
        ngram = tuple(candidate[i:i+n])
        ngrams.append(ngram)
    return ngrams

def calculate_precision(candidate, references, n):
    candidate_ngrams = calculate_ngram(candidate, n)
    reference_ngrams = [calculate_ngram(ref, n) for ref in references]

    candidate_counter = Counter(candidate_ngrams)
    reference_counters = [Counter(ref) for ref in reference_ngrams]

    clipped_counts = dict()
    for ngram, count in candidate_counter.items():
        max_reference_count = max(ref_counter[ngram] for ref_counter in reference_counters)
        clipped_counts[ngram] = min(count, max_reference_count)

    numerator = sum(clipped_counts.values())
    denominator = max(1, sum(candidate_counter.values()))

    precision = numerator / denominator
    return precision

def calculate_bleu(candidate, references, weights):
    candidate_tokens = tokenize(candidate)
    reference_tokens = [tokenize(ref) for ref in references]

    precisions = []
    for n in range(1, len(weights) + 1):
        precision = calculate_precision(candidate_tokens, reference_tokens, n)
        precisions.append(precision)

    # Handling NaN or infinite values in precision
    precisions = [p if not math.isnan(p) and p != 0.0 else 1e-10 for p in precisions]

    geo_mean = math.exp(sum((w * math.log(p) for w, p in zip(weights, precisions))) / len(weights))
    brevity_penalty = min(1.0, len(candidate_tokens) / min(len(ref) for ref in reference_tokens))

    bleu = brevity_penalty * geo_mean
    return bleu

# Example usage
candidate = "The cat is on the mat"
references = ["There is a cat on the mat", "The mat has a cat"]
weights = [0.25, 0.25, 0.25, 0.25]

bleu_score = calculate_bleu(candidate, references, weights)
print("BLEU score:", bleu_score)

import nltk

nltk.download('punkt')

import math

from collections import Counter

def tokenize(sentence):

return nltk.word_tokenize(sentence)

def calculate_ngram(candidate, n):

ngrams = []

for i in range(len(candidate)-n+1):

ngram = tuple(candidate[i:i+n])

ngrams.append(ngram)

return ngrams

def calculate_precision(candidate, references, n):

candidate_ngrams = calculate_ngram(candidate, n)

reference_ngrams = [calculate_ngram(ref, n) for ref in references]

candidate_counter = Counter(candidate_ngrams)

reference_counters = [Counter(ref) for ref in reference_ngrams]

clipped_counts = dict()

for ngram, count in candidate_counter.items():

max_reference_count = max(ref_counter[ngram] for ref_counter in reference_counters)

clipped_counts[ngram] = min(count, max_reference_count)

numerator = sum(clipped_counts.values())

denominator = max(1, sum(candidate_counter.values()))

precision = numerator / denominator

return precision

def calculate_bleu(candidate, references, weights):

candidate_tokens = tokenize(candidate)

reference_tokens = [tokenize(ref) for ref in references]

precisions = []

for n in range(1, len(weights) + 1):

precision = calculate_precision(candidate_tokens, reference_tokens, n)

precisions.append(precision)

# Handling NaN or infinite values in precision

precisions = [p if not math.isnan(p) and p != 0.0 else 1e-10 for p in precisions]

geo_mean = math.exp(sum((w * math.log(p) for w, p in zip(weights, precisions))) / len(weights))

brevity_penalty = min(1.0, len(candidate_tokens) / min(len(ref) for ref in reference_tokens))

bleu = brevity_penalty * geo_mean

return bleu

# Example usage

candidate = "The cat is on the mat"

references = ["There is a cat on the mat", "The mat has a cat"]

weights = [0.25, 0.25, 0.25, 0.25]

bleu_score = calculate_bleu(candidate, references, weights)

print("BLEU score:", bleu_score)

Here’s a breakdown of the code:

Tokenization:
- The tokenize function splits a given sentence into individual words or tokens.
N-gram Calculation:
- The calculate_ngram function takes a list of tokens (words) and an integer n as input, and it returns a list of n-grams (contiguous sequences of n tokens) from the input list.
Precision Calculation:
- The calculate_precision function computes the precision score for a given candidate sentence in comparison to one or more reference sentences. It uses n-grams for this calculation.
- It counts the occurrences of n-grams in both the candidate and reference sentences and computes a precision value.
BLEU Calculation:
- The calculate_bleu function takes a candidate sentence, a list of reference sentences, and a list of weights as input.
- It tokenizes the input sentences, calculates precision for different n-gram sizes, and combines them using a weighted geometric mean.
- The BLEU score is a combination of precision values for different n-gram sizes, and the weights are used to assign importance to each n-gram size.
Example Usage:
- An example is provided at the end, where a candidate sentence (“The cat is on the mat”) is compared to two reference sentences (“There is a cat on the mat” and “The mat has a cat”).
- The weights for different n-gram sizes are set to equal values (0.25 each), and the BLEU score is calculated using the calculate_bleu function.
- The final BLEU score is printed out.

If you have any doubt/suggestion please feel free to ask and I will do my best to help or improve myself. Good-bye until next time.

Computer Vision Quiz-5

1 Reply

Q1. For a multi-channel input feature map, we apply Max-pooling independently on each channel and then concatenate the results along the channel axis?

True
False

Answer: 1
Explanation: Max-pooling operation is applied independently on each channel and then the results are concatenated along the channel axis to form the final output. Refer to this beautiful explanation by Andrew Ng to understand more.

Q2. A fully convolutional network can take as input the image of any size?

True
False

Answer: 1
Explanation: Because a fully convolutional network doesnot contain any fully connected or Dense layer, this can take as input the image of any size.

Q3. In R-CNN, the bounding box loss is only calculated for positive samples (samples that contains classes present in the dataset)?

True
False

Answer: 1
Explanation: In R-CNN, the bounding box loss is only calculated for positive samples (samples that contains classes present in the dataset) as it makes no sense to fine-tune a bounding box that doesn’t contain object.

Q4. In the VGG16 model, we have all the Conv layers with same padding and filter size and the downsampling is done by MaxPooling only?

True
False

Answer: 1
Explanation: Earlier models like AlexNet use large filter size in the beginning and downsampling was done either by max-pooling or by convolution. But in the VGG16 model, we have all the Conv layers with same padding and filter size and the downsampling is done by MaxPooling only. So what have we gained by using, for instance, a stack of three 3×3 conv. layers instead of a single 7×7 layer? First, we incorporate three non-linear rectification layers instead of a single one, which makes the decision function more discriminative. Second, we decrease the number of parameters. Refer to the Section 2.3 of this research paper to understand more.

Q5. 1×1 convolution can also help in decreasing the computation cost of a convolution operation?

True
False

Answer: 1
Explanation: 1×1 convolution can also help in decreasing the computation cost of a convolution operation. Refer to this beautiful explanation by Andrew Ng to understand more.

Q6. Can we use Fully Convolutional Neural Networks for object detection?

Answer: 1
Explanation: Yes a Fully Convolutional Neural Networks can be used for object detection. For instance, YOLO etc.

Q7. Which of the following networks can be used for object detection?

Overfeat
Faster RCNN
YOLO
All of the above

Answer: 4
Explanation: All of the above mentioned networks can be used for object detection. For instance, Faster RCNN belongs to Region based methods whereas YOLO, Overfeat belongs to sliding window based methods.

Q8. AlexNet was one of the first networks that uses ReLU activation function in the hidden layers instead of tanh/sigmoid (which were quite common at that time)?

True
False

Answer: 1
Explanation: This was one of the revolutionary ideas that boomed deep learning i.e. using ReLU activation function in the hidden layers instead of tanh/sigmoid (which were quite common at that time).

Computer Vision Quiz-4

Leave a reply

Q1. The values in a filter/mask are called as

Coefficients
Weights
Both of the above
None of the above

Answer: 3
Explanation: The values in a filter/mask are called as either coefficients or weights.

Q2. Which of the following networks uses the idea of Depthwise Separable Convolutions?

AlexNet
MobileNet
ResNet
VGG16

Answer: 2
Explanation: As mentioned in the MobileNet paper, MobileNets are based on a streamlined architecture that uses depthwise separable convolutions to build light weight deep neural networks that work even in low compute environment, such as a mobile phones. Refer to this research paper to understand more.

Q3. What is the output of a Region Proposal Network (RPN) at each sliding window location if we have k anchor boxes?

2k scores and 4k bounding box coordinates
4k scores and 2k bounding box coordinates
k scores and 4k bounding box coordinates
4k scores and 4k bounding box coordinates

Answer: 1
Explanation: In a Region Proposal Network (RPN), for k anchor boxes we get 2k scores (that estimate probability of object or not) and 4k bounding box coordinates corresponding to each sliding window location. Refer to Figure 3 of this research paper to understand more.

Q4. Which of the following networks uses Skip-connections?

DenseNet
ResNet
U-Net
All of the above

Answer: 4
Explanation: All of the above mentioned networks uses Skip-connections.

Q5. For binary classification, we generally use ________ activation function in the output layer?

Tanh
ReLU
Sigmoid
Leaky ReLU

Answer: 3
Explanation: For binary classification, we want the output (y) to be either 0 or 1. Because sigmoid outputs the P(y=1|x) and has value between 0 and 1, so it is appropriate for binary classification.

Q6. In ResNet’s Skip-connection, the output from the previous layer is ________ to the layer ahead?

added
concatenated
convoluted
multiplied

Answer: 1
Explanation: In ResNet’s Skip-connection, the output from the previous layer is added to the layer ahead. Refer to the Figure 2 of this research paper to understand more.

Q7. In Fast R-CNN, we extract feature maps from the input image only once as compared to R-CNN where we extract feature maps from each region proposal separately?

True
False

Answer: 1
Explanation: Earlier in R-CNN we were extracting features from each region proposals separately using a CNN and this was very time consuming. So, to counter this, in Fast R-CNN we extract feature maps from the input image only once and then project the region proposals onto this feature map. This saves a lot of time. Refer to this link to understand more.

Q8. For Multiclass classification, we generally use ________ activation function in the output layer?

Tanh
ReLU
Sigmoid
Softmax

Answer: 4
Explanation: For Multiclass classification, we generally use softmax activation function in the output layer. Refer to this beautiful explanation by Andrew Ng to understand more.

Computer Vision Quiz-3

Leave a reply

Q1. Which of the following object detection networks uses a ROI Pooling layer?

R-CNN
Fast R-CNN
YOLO
All of the above

Answer: 2
Explanation: Out of the above mentioned networks, only Fast R-CNN uses a ROI Pooling layer. Becuase of this, Fast R-CNN can take any size image as input as compared to R-CNN where we need to resize region proposals before passing into CNN. Refer to this research paper to understand more.

Q2. Which of the following techniques can be used to reduce the number of channels/feature maps?

Pooling
Padding
1×1 convolution
Batch Normalization

Answer: 3
Explanation: 1×1 convolution can be used to reduce the number of channels/feature maps. Refer to this beautiful explanation by Andrew Ng to understand more.

Q3. Which of the following networks has the fastest prediction time?

R-CNN
Fast R-CNN
Faster R-CNN

Answer: 3
Explanation: As clear from the name, Faster R-CNN has the fastest prediction time. Refer to this research paper to understand more.

Q4. Max-Pooling makes the Convolutional Neural Network translation invariant (for small translations of the input)?

True
False

Answer: 1
Explanation: According to Ian Goodfellow, Max pooling achieves partial invariance to small translations because the max of a region depends only on the single largest element. If a small translation doesn’t bring in a new largest element at the edge of the pooling region and also doesn’t remove the largest element by taking it outside the pooling region, then the max doesn’t change.

Q5. What do you mean by the term “Region Proposals” as used in the R-CNN paper?

regions of an image that could possibly contain an object of interest
regions of an image that could possibly contain information other than the object of interest
final bounding boxes given by the R-CNN

Answer: 1
Explanation: As clear from the name, Region Proposals are a set of candidate regions that could possibly contain an object of interest. These region proposals are then fed to a CNN which extracts features from each of these proposals and these features are then fed to a SVM classifier to determine what type of object (if any) is contained within the proposal. The main reason behind extracting these region proposals beforehand is that instead of searching the object at all image locations, we should search for only those locations where there is a possibility of object. This will reduce the false positives as we are only searching in the regions where there is a possibility of having an object. Refer to this research paper to understand more.

Q6. Because Pooling layer has no parameters, they don’t affect the gradient calculation during backpropagation?

True
False

Answer: 2
Explanation: It is true that Pooling layer has no parameters and hence no learning takes place during backpropagation. But it’s wrong to say that they don’t affect the gradient calculation during backpropagation because pooling layer guides the gradient to the right input from which the Pooling output came from. Refer to this link to know more.

Q7. Which of the following techniques was used by Traditional computer vision object detection algorithms to locate objects in images at varying scales and locations?

image pyramids for varing scale and sliding windows for varing locations
image pyramids for varing locations and sliding windows for varing scale

Answer: 1
Explanation: Becuase an object can be of any size and can be present at any location, so for object detection we need to search both at different locations and scales. As we know that by using image pyramids (multi-resolution representations for images) we can handle scale dependency and for locations we can use sliding window. So, traditional computer vision algorithms use these for object detection. For instance, refer to the Overfeat paper that shows how a multiscale and sliding window approach can be efficiently implemented within a ConvNet.

Q8. How do you introduce non-linearity in a Convolutional Neural Network (CNN)?

Using ReLU
Using a Max-Pooling layer
Both of the above
None of the above

Answer: 3
Explanation: Non-linearity can be introduced by either using ReLU (non-linear activation function) or by using a Max-Pooling layer (as max is a non-linear function).