# Source code for FairLangProc.metrics.probability

"""Submodule of the FairLangProc.metrics module which stores all methods and metrics related
to Language Modelling.

The supported metrics are LPBS, CBS, CPS and AUL.
"""

# Standard libraries
from typing import TypeVar

# Numpy
import numpy as np

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F

TokenizerType = TypeVar("TokenizerType", bound="PreTrainedTokenizer")

def MaskProbability(
    model: nn.Module,
    tokenizer: TokenizerType,
    sentences: list[str],
    target_words: list[str],
    mask_indices: list[int],
    how_many: int = 2
    ) -> torch.Tensor:
    r"""Probability of one target word at one mask position, per sentence.

    The sentences are tokenized as a single padded batch and run through the
    model once. For sentence ``i`` the softmax over the vocabulary is read at
    the chosen mask position, for the id of ``target_words[i]``.

    Notes
    -----
    * Only the first sub-token that the tokenizer produces for each target
      word is scored.
    * Mask positions of the whole batch are looked up in one flat list, using
      ``how_many * i + mask_indices[i]``. This is only meaningful when every
      sentence holds exactly ``how_many`` mask tokens.
    * When ``how_many == 1`` the values of ``mask_indices`` are not used for
      the lookup (the single mask of each sentence is taken).

    Parameters
    ----------
    model : nn.Module
        Language Model used to compute probabilities. It is called with the
        tokenizer output unpacked as keyword arguments and must return an
        object with a ``logits`` attribute.
    tokenizer : TokenizerType
        Tokenizer associated with the model.
    sentences : list[str]
        List of sentences with masks.
    target_words : list[str]
        One word per sentence whose probability is wanted.
    mask_indices : list[int]
        For each sentence, which of its masks (first, second, ...) the target
        word belongs to.
    how_many : int
        Number of masks contained in each sentence.

    Returns
    -------
    prob_target : torch.Tensor
        Probability of target_words in the positions indicated by mask_indices.

    Example
    -------
    >>> model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
    >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    >>> sentences = ["The [MASK] is a [MASK]", "[MASK] is such a [MASK]"]
    >>> target_words = ["engineer", "He"]
    >>> mask_indices = [0,1]
    >>> how_many = 2
    >>> 
    >>> probabilities = MaskProbability(model, tokenizer, sentences, target_words, mask_indices, how_many = how_many)
    """

    # Work on a numpy array so the index arithmetic below is vectorised.
    slots = mask_indices if isinstance(mask_indices, np.ndarray) else np.array(mask_indices)

    n_sentences = len(sentences)
    rows = np.arange(n_sentences)

    assert n_sentences == len(target_words), "Different number of sentences and target words."
    assert n_sentences == len(slots), "Different number of sentences and mask indices."

    encoded = tokenizer(sentences, padding = True, return_tensors="pt")

    # Vocabulary id of the first sub-token of every target word.
    target_ids = []
    for word in target_words:
        first_piece = tokenizer.tokenize(word)[0]
        target_ids.append(tokenizer.convert_tokens_to_ids(first_piece))

    # Column (token position) of every mask in the batch, in row-major order.
    mask_columns = torch.where(encoded.input_ids == tokenizer.mask_token_id)[1]

    with torch.no_grad():
        logits = model(**encoded).logits

    vocab_probs = F.softmax(logits, dim = -1)

    # Rank of the wanted mask inside the flat list of all masks.
    flat_rank = rows if how_many == 1 else how_many*rows + slots

    return vocab_probs[rows, mask_columns[flat_rank], target_ids]


def MaskProbabilityQuotient(
    model: nn.Module,
    tokenizer: TokenizerType,
    sentences: list[str],
    target_words: list[tuple[str]],
    fill_words: list[str],
    mask_indices: list[int]
    ) -> list[torch.Tensor]:

    r"""Computes the quotient between the filled and the prior probability of each target word.

    Assumes sentences with two masks. For every sentence, the mask that is
    *not* pointed to by mask_indices is replaced by the matching fill word.
    The probability of the target word in the remaining mask of that filled
    sentence is then divided by its prior probability, i.e. the probability in
    the same position when both masks are still present.

    Parameters
    ----------
    model : nn.Module
        Language Model used to compute probabilities.
    tokenizer : TokenizerType
        Tokenizer associated with the model. Its ``mask_token`` attribute, when
        available, is the string searched for in the sentences; otherwise
        ``"[MASK]"`` is used.
    sentences : list[str]
        List of sentences with masks.
    target_words : list[tuple[str]]
        List containing tuples of words whose probabilities we want to compute.
        The length of the first tuple sets the number of categories.
    fill_words : list[str]
        List of words which replace the secondary mask.
    mask_indices : list[int]
        List of indices which indicate to which mask of the sentence each
        target word corresponds to (i.e. first (0) or second (1)).

    Returns
    -------
    probs : list[torch.Tensor]
        One tensor per category (position inside the target tuples), each
        holding one quotient per sentence.

    Raises
    ------
    ValueError
        If a sentence does not contain the mask that has to be filled.

    Example
    -------
    >>> model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
    >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    >>> sentences = ["The [MASK] is a [MASK]", "[MASK] is such a [MASK]"]
    >>> target_words = [("man", "woman"), ("He", "She")]
    >>> fill_words = ["engineer", "drag"]
    >>> mask_indices = [1,0]
    >>> 
    >>> quotients = MaskProbabilityQuotient(model, tokenizer, sentences, target_words, fill_words, mask_indices)
    """

    mask_token = getattr(tokenizer, "mask_token", None) or "[MASK]"
    n_cat = len(target_words[0])

    target_slots = np.asarray(mask_indices, dtype = int)
    # With two masks per sentence the fill word goes to "the other" mask.
    fill_slots = 1 - target_slots

    filled_sentences = []
    for template, word, slot in zip(sentences, fill_words, fill_slots):
        # str.replace(old, new, count) limits the NUMBER of replacements, it
        # cannot address the n-th occurrence. Split on the mask and rebuild
        # the sentence around the chosen occurrence instead.
        pieces = template.split(mask_token)
        if not 0 <= slot < len(pieces) - 1:
            raise ValueError(
                f"Sentence {template!r} has no mask number {int(slot)} to fill."
            )
        before = mask_token.join(pieces[:slot + 1])
        after = mask_token.join(pieces[slot + 1:])
        filled_sentences.append(before + word + after)

    probs = []

    for cat in range(n_cat):
        words = [word_tuple[cat] for word_tuple in target_words]

        # Prior: both masks present. Posterior: only the target mask is left.
        prior_probs = MaskProbability(model, tokenizer, sentences, words, target_slots, how_many = 2)
        post_probs = MaskProbability(model, tokenizer, filled_sentences, words, target_slots, how_many = 1)
        probs.append(post_probs/prior_probs)

    return probs



# [docs]
def LPBS(
    model: nn.Module,
    tokenizer: TokenizerType,
    sentences: list[str],
    target_words: list[tuple[str]],
    fill_words: list[str],
    mask_indices: list[int] = None
    ) -> torch.Tensor:
    r"""LPBS score of every sentence for pairs of target words.

    The score is the log of the probability quotient of the first word of each
    pair minus the log of the quotient of the second word, with the quotients
    obtained from ``MaskProbabilityQuotient``.

    Parameters
    ----------
    model : nn.Module
        Language model used to compute probabilities.
    tokenizer : TokenizerType
        Tokenizer associated with the model.
    sentences : list[str]
        List of sentences with masks.
    target_words : list[tuple[str]]
        List with one pair of words per sentence.
    fill_words : list[str]
        List of words which replace the secondary mask.
    mask_indices : list[int]
        List of indices which indicate to which mask of the sentence each
        target word corresponds (i.e. first (0) or second (1)). When omitted,
        the first mask is used for every sentence.

    Returns
    -------
    probs : torch.Tensor
        LPBS scores, one per sentence.

    Example
    -------
    >>> model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
    >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    >>> sentences = ["[MASK] is a [MASK].", "[MASK] is a [MASK].", "The [MASK] was a [MASK]."]
    >>> target_words = [("John", "Mary"), ("He", "She"), ("man", "woman")]
    >>> fill_words = ["engineer","nurse","doctor"]
    >>> mask_indices = [0, 0, 1]
    >>> 
    >>> LPBSscore = LPBS(
    ...     model = model,
    ...     tokenizer = tokenizer,
    ...     sentences = sentences,
    ...     target_words = target_words,
    ...     fill_words = fill_words,
    ...     mask_indices = mask_indices
    ... )
    """

    n_sentences = len(sentences)

    assert n_sentences == len(fill_words), "Different number of sentences and fill words."
    assert n_sentences == len(target_words), "Different number of sentences and target words."
    assert len(target_words[0]) == 2, "Target words must consist of pairs of words."

    # Default: the target word always sits in the first mask.
    if mask_indices is None:
        mask_indices = [0] * n_sentences

    quotients = MaskProbabilityQuotient(
        model, tokenizer, sentences, target_words, fill_words, mask_indices
    )

    log_first = torch.log(quotients[0])
    log_second = torch.log(quotients[1])
    return log_first - log_second




# [docs]
def CBS(
    model: nn.Module,
    tokenizer: TokenizerType,
    sentences: list[str],
    target_words: list[tuple[str]],
    fill_words: list[str],
    mask_indices: list[int] = None
    ) -> torch.Tensor:
    r"""Computes CBS score for a list of tuples of dimension n of target words.

    For every sentence the probability quotients of the n target words are
    obtained with ``MaskProbabilityQuotient``; the score is the variance
    (``torch.var``) of their logarithms across the n categories.

    Parameters
    ----------

    model : nn.Module
        Language model used to compute probabilities.
    tokenizer : TokenizerType
        Tokenizer associated with the model
    sentences : list[str]
        List of sentences with masks
    target_words : list[tuple[str]]
        List containing tuples of words whose probabilities we want to compute
    fill_words : list[str]
        List of words which replace the secondary mask
    mask_indices : list[int]
        List of indices which indicate to which mask of the sentence
        each target word corresponds (i.e. first (0) or second (1)).
        When omitted (``None``), the first mask is used for every sentence.

    Returns
    -------
    probs : torch.Tensor
        CBS scores, one per sentence.

    Example
    -------
    >>> model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
    >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    >>> target_words = [("John", "Mamadouk", "Liu"), ("white", "black", "asian"), ("white", "black", "asian")]
    >>> sentences = ["[MASK] is a [MASK]", "The [MASK] kid got [MASK] results", "The [MASK] kid wanted to be a [MASK]"]
    >>> fill_words = ["engineer", "outstanding", "doctor"]
    >>> mask_indices = [0, 1, 1]
    >>> 
    >>> CBSscore = CBS(
    ...     model = model,
    ...     tokenizer = tokenizer,
    ...     sentences = sentences,
    ...     target_words = target_words,
    ...     fill_words = fill_words,
    ...     mask_indices = mask_indices
    ... )
    """

    assert len(sentences) == len(fill_words), "Different number of sentences and fill words."
    assert len(sentences) == len(target_words), "Different number of sentences and target words."

    # Same default as LPBS: the target word sits in the first mask.
    if mask_indices is None:
        mask_indices = [0] * len(sentences)

    probs = MaskProbabilityQuotient(model, tokenizer, sentences, target_words, fill_words, mask_indices)
    # (n_sentences, n_categories): one column per position in the tuples.
    probs = torch.stack(probs, dim = 1)
    scores = torch.var(torch.log(probs), dim = 1)
    return scores




def MaskedPseudoLogLikelihood(
    model: nn.Module,
    input_ids: list[int],
    target_id: int,
    mask_id: int,
    cls_id: int,
    pad_id: int
    ) -> float:
    """Computes the PLL score for a sentence where all words are progressively masked with the exception of
    the tokens equal to target_id.

    The scored span runs from the first token that is not ``cls_id`` up to, but
    not including, the last token that is not ``pad_id`` (so the closing
    special token of the sentence is left out). Every token of that span that
    differs from ``target_id`` is masked in its own copy of the sentence, and
    the log-probability (log-softmax over the vocabulary) of the original
    token at the masked position is accumulated.

    Parameters
    ----------
    model : nn.Module
        Language model used to compute probabilities. It is called with a 2-D
        tensor of token ids and must return an object with a ``logits``
        attribute.
    input_ids : list[int]
        One-dimensional sequence (tensor or list) of token ids forming the
        sentence.
    target_id : int
        Id of the token which should not be masked.
    mask_id : int
        Id of the mask token.
    cls_id : int
        Id of the cls token.
    pad_id : int
        Id of the pad token.

    Returns
    -------
    score : float
        Sum of the log-probabilities of the masked tokens. ``0.0`` when there
        is no token to mask.

    Raises
    ------
    ValueError
        If ``input_ids`` is not one-dimensional.

    Example
    --------
    >>> model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    >>> sentence = 'The actor did a terrible job'
    >>> input_ids = tokenizer(sentence, return_tensors = 'pt')['input_ids'][0]
    >>> target_id = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('actor')[0])
    >>> mask_id = tokenizer.mask_token_id
    >>> pad_id = tokenizer.pad_token_id
    >>> cls_id = tokenizer.cls_token_id
    >>> 
    >>> score = MaskedPseudoLogLikelihood(
    ...     model = model,
    ...     input_ids = input_ids,
    ...     target_id = target_id,
    ...     mask_id = mask_id,
    ...     pad_id = pad_id,
    ...     cls_id = cls_id
    ... )
    """

    ids = torch.as_tensor(input_ids)
    if ids.dim() != 1:
        raise ValueError("input_ids must be a one-dimensional sequence of token ids.")

    tokens = ids.tolist()
    n_tokens = len(tokens)

    # First non-CLS token and last non-PAD token; the defaults yield an empty span.
    start = next((i for i, tok in enumerate(tokens) if tok != cls_id), n_tokens)
    end = next((i for i in reversed(range(n_tokens)) if tokens[i] != pad_id), 0)

    # Positions to mask: every token of the span except the target token(s).
    positions = [i for i in range(start, end) if tokens[i] != target_id]
    if not positions:
        return 0.0

    # Trailing padding is dropped: the model is called without an attention
    # mask, so pad tokens would otherwise be attended to as regular tokens.
    ids = ids[:end + 1]

    rows = torch.arange(len(positions), device = ids.device)
    cols = torch.tensor(positions, dtype = torch.long, device = ids.device)
    masked_words = ids[cols].long()

    # One copy of the sentence per masked position.
    masked_sentences = ids.repeat(len(positions), 1)
    masked_sentences[rows, cols] = mask_id

    with torch.no_grad():
        logits = model(masked_sentences).logits
        # Normalise over the vocabulary (last axis), not over the sequence.
        log_probs = F.log_softmax(logits, dim = -1)

    score = torch.sum(log_probs[rows, cols, masked_words])

    return score.item()





# [docs]
def CPS(
    model: nn.Module,
    tokenizer: TokenizerType,
    sentences: list[str],
    target_words: list[str]
    ) -> list[float]:
    r"""Computes the CPS score for list of sentences.

    Each sentence is tokenized on its own (so sentences of different lengths
    are supported without padding) and scored with
    ``MaskedPseudoLogLikelihood``, leaving the first sub-token of its target
    word unmasked.

    Parameters
    ----------
    model : nn.Module
        Language model used to compute probabilities.
    tokenizer : TokenizerType
        Tokenizer associated with the model.
    sentences : list[str]
        List of sentences for whom we will compute the CPS score.
    target_words : list[str]
        List of target words which should not be masked.

    Returns
    -------
    score : list[float]
        List of CPS score of the sentences.

    Example
    -------
    >>> model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    >>> sentences = ['The actor did a terrible job', 'The actress did a terrible job', 'The doctor was an exemplary man', 'The doctor was an exemplary woman']
    >>> target_words = ['actor', 'actress', 'man', 'woman']
    >>> 
    >>> CPSscore = CPS(
    ...     model = model,
    ...     tokenizer = tokenizer,
    ...     sentences = sentences,
    ...     target_words = target_words
    ... )
    """

    assert len(sentences) == len(target_words), "Number of sentences and target words must be the same."
    assert len(sentences) != 0, "Empty sentence list."

    mask_id = tokenizer.mask_token_id
    # The id of the pad *token*; pad_token_type_id is the segment id used for
    # padding, which is a different quantity.
    pad_id = tokenizer.pad_token_id
    cls_id = tokenizer.cls_token_id

    scores = []

    for sentence, word in zip(sentences, target_words):
        # A single string yields a batch of one; take its only row.
        sent = tokenizer(sentence, return_tensors="pt")['input_ids'][0]
        target_id = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word)[0])

        score = MaskedPseudoLogLikelihood(
            model = model,
            input_ids = sent,
            target_id = target_id,
            mask_id = mask_id,
            cls_id = cls_id,
            pad_id = pad_id
            )
        scores.append(score)

    return scores




def UnMaskedPseudoLogLikelihood(
    model: nn.Module,
    input_ids: list[int],
    cls_id: int,
    pad_id: int
    ) -> float:
    r"""Computes the PLL score of an unmasked sentence.

    The sentence is fed to the model once, without masking anything. The
    score is the mean log-probability (log-softmax over the vocabulary) that
    the model gives to each token at its own position. The scored span runs
    from the first token that is not ``cls_id`` up to, but not including, the
    last token that is not ``pad_id``.

    Parameters
    ----------
    model : nn.Module
        Language model used to compute probabilities. It is called with a 2-D
        tensor of token ids and must return an object with a ``logits``
        attribute.
    input_ids : list[int]
        One-dimensional sequence (tensor or list) of token ids forming the
        sentence.
    cls_id : int
        Id of the cls token.
    pad_id : int
        Id of the pad token.

    Returns
    -------
    score : float
        PLL of the unmasked sentence. ``nan`` when the scored span is empty.

    Raises
    ------
    ValueError
        If ``input_ids`` is not one-dimensional.

    Example
    -------
    >>> model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    >>> sentence = 'The actor did a terrible job'
    >>> input_ids = tokenizer(sentence, return_tensors = 'pt')['input_ids'][0]
    >>> pad_id = tokenizer.pad_token_id
    >>> cls_id = tokenizer.cls_token_id
    >>> 
    >>> score = UnMaskedPseudoLogLikelihood(
    ...     model = model,
    ...     input_ids = input_ids,
    ...     pad_id = pad_id,
    ...     cls_id = cls_id
    ... )
    """

    ids = torch.as_tensor(input_ids)
    if ids.dim() != 1:
        raise ValueError("input_ids must be a one-dimensional sequence of token ids.")

    tokens = ids.tolist()
    n_tokens = len(tokens)

    # First non-CLS token and last non-PAD token; the defaults yield an empty span.
    start = next((i for i, tok in enumerate(tokens) if tok != cls_id), n_tokens)
    end = next((i for i in reversed(range(n_tokens)) if tokens[i] != pad_id), 0)

    # Nothing to average over: same value the mean of an empty tensor gives.
    if start >= end:
        return float("nan")

    # Trailing padding is dropped: the model is called without an attention
    # mask, so pad tokens would otherwise be attended to as regular tokens.
    ids = ids[:end + 1]

    with torch.no_grad():
        logits = model(ids.unsqueeze(0)).logits
        # Normalise over the vocabulary (last axis), not over the sequence.
        log_probs = F.log_softmax(logits, dim = -1)

    cols = torch.arange(start, end, device = ids.device)
    token_ids = ids[start:end].long()

    score = torch.mean(log_probs[0, cols, token_ids])

    return score.item()




# [docs]
def AUL(
    model: nn.Module,
    tokenizer: TokenizerType,
    sentences: list[str]
    ) -> list[float]:

    r"""Computes the AUL score for list of sentences.

    Each sentence is tokenized on its own (so sentences of different lengths
    are supported without padding) and scored with
    ``UnMaskedPseudoLogLikelihood``.

    Parameters
    ----------
    model : nn.Module
        Language model used to compute probabilities.
    tokenizer : TokenizerType
        Tokenizer associated with the model.
    sentences : list[str]
        List of sentences for whom we will compute the AUL score.

    Returns
    -------
    score : list[float]
        List of AUL score of the sentences.

    Example
    -------
    >>> model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    >>> sentences = ['The actor did a terrible job', 'The actress did a terrible job', 'The doctor was an exemplary man', 'The doctor was an exemplary woman']
    >>> 
    >>> AULscore = AUL(
    ...     model = model,
    ...     tokenizer = tokenizer,
    ...     sentences = sentences
    ... )
    """

    assert len(sentences) != 0, "Empty sentence list."

    # The id of the pad *token*; pad_token_type_id is the segment id used for
    # padding, which is a different quantity.
    pad_id = tokenizer.pad_token_id
    cls_id = tokenizer.cls_token_id

    scores = []

    for sentence in sentences:
        # A single string yields a batch of one; take its only row.
        sent = tokenizer(sentence, return_tensors="pt")['input_ids'][0]

        score = UnMaskedPseudoLogLikelihood(
            model = model,
            input_ids = sent,
            cls_id = cls_id,
            pad_id = pad_id
            )
        scores.append(score)

    return scores
# Source code for FairLangProc.metrics.probability
#
# FairLangProc
#
# Navigation
#
# Related Topics