Source code for peptidedigest.model_prompts

"""
Functions for generating model prompts for the Peptide Digest LLM.
"""

import re

from .clean_text import clean_summary


# summarize_article_segments and summarize_article_meta pass every model reply through the imported clean_summary.

[docs]
def summarize_article_segments(fulltext, tokenizer, model):
    """
    Summarizes a scientific article chunk by chunk, then condenses the result.

    Every chunk in ``fulltext`` is sent to the model with a request for a
    6 sentence summary; each reply is passed through ``clean_summary`` and the
    cleaned replies are concatenated, one per line.  That concatenation is then
    sent back to the model with a request for a 5-6 bullet point summary of
    the whole article.

    Parameters
    ----------
    fulltext : list of str
        A list of text chunks from a scientific article.

    tokenizer : callable
        Called as ``tokenizer(prompt, return_tensors="pt")``; the result must
        support ``.to("cuda")`` and ``**`` unpacking into ``model.generate``.
        Must also provide ``decode(ids, skip_special_tokens=True)``.
        (Presumably a Hugging Face tokenizer -- confirm against the caller.)

    model : object
        Must provide ``generate(**inputs, max_new_tokens=...)`` and accept
        ``no_repeat_ngram_size`` for the final prompt.

    Returns
    -------
    final_summary : str
        The model's reply to the final prompt (the bullet point summary of
        the whole article).

    bullet_points : str
        The cleaned per-chunk summaries, each followed by a newline.  Despite
        the name, these are the replies to the "6 sentence summary" prompt.
    """
    marker = "model-gemma"

    def _reply(prompt, **generate_kwargs):
        # Run one prompt through the model and return only the model's reply.
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=8000, **generate_kwargs)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The reply is whatever follows the first marker in the decoded text.
        # If the marker is missing, keep the whole text: the previous
        # ``find(...) + len(...)`` arithmetic turned the -1 "not found" result
        # into index 10 and silently dropped the first characters.
        _, found, reply = decoded.partition(marker)
        return (reply if found else decoded).strip()

    chunk_summaries = []
    for text in fulltext:
        input_text = f"""
        <start_of_turn>user
        Generate a 6 sentence summary of the following portion of a scientific article, make sure to capture results/revelations.

        {text}
        <end_of_turn>

        <start_of_turn>model-gemma
        """
        chunk_summaries.append(clean_summary(_reply(input_text)))

    # One cleaned summary per line, including a trailing newline after the last.
    bullet_points = "".join(chunk + "\n" for chunk in chunk_summaries)

    final_summary_input = f"""<start_of_turn>user
    
    Generate a 5 bullet point summary of the following scientific texts, the bullet points should be an effective summary of the article/ study and touch on any results,revelations, chemistry/ biology.((only give bullet points)) (5-6 bullet points total)
    
    {bullet_points}
    
    <end_of_turn>
    
    <start_of_turn>model-gemma
    """

    # Only the final pass sets no_repeat_ngram_size.
    final_summary = _reply(final_summary_input, no_repeat_ngram_size=2)

    return final_summary, bullet_points




[docs]
def summarize_article_meta(fulltext, tokenizer, model):
    """
    Asks the model to fill in a fixed metadata template for each article chunk.

    For every chunk in ``fulltext`` the model is prompted to list bullet
    points under the topics named in the prompt (peptides, proteins/targets,
    domains of interest, chemistry, biology, computational methods).  Each
    reply is passed through ``clean_summary`` and labelled with its chunk
    number.

    Parameters
    ----------
    fulltext : list of str
        A list of text chunks from a scientific article.

    tokenizer : callable
        Called as ``tokenizer(prompt, return_tensors="pt")``; the result must
        support ``.to("cuda")`` and ``**`` unpacking into ``model.generate``.
        Must also provide ``decode(ids, skip_special_tokens=True)``.
        (Presumably a Hugging Face tokenizer -- confirm against the caller.)

    model : object
        Must provide ``generate(**inputs, max_new_tokens=...)``.

    Returns
    -------
    bullet_points : str
        For each chunk, in order: the line ``Chunk <n>`` (numbered from 1),
        the cleaned model reply, and a newline.  Empty string when
        ``fulltext`` is empty.
    """
    marker = "model-gemma"
    sections = []
    for chunk_number, text in enumerate(fulltext, start=1):
        input_text = f"""
        <start_of_turn>user
        fill in the metadata below from the scientific article piece given. this is all you should do, take great effort to create many insightful bullet points for each topic :
        
        
        metadata topics to fill in:
        **Peptides discussed:**
        - fill in bullet points here be specififc


        **Proteins/targets discussed:**
        - fill in bullet points here be specific


        **Domains of interest:**
        - fill in bullet points here make inferences be specific
        
        **Chemical matter/ chemistry discussed:**
        - fill in bullet points here make inferences the article will talk about chemistry be specific
        
        **biological matter/ biology discussed:**
        - fill in bullet points here make inferences the text will talk about biology be specififc
        
        **computational methods**
        -fill in bullet points here make inferences the text is about a computational article be specififc
        
        text:
        {text}
        <end_of_turn>

        <start_of_turn>model-gemma
        """

        input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
        outputs = model.generate(**input_ids, max_new_tokens=8000)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # The reply is whatever follows the first marker in the decoded text.
        # If the marker is missing, keep the whole text: the previous
        # ``find(...) + len(...)`` arithmetic turned the -1 "not found" result
        # into index 10 and silently dropped the first characters.
        _, found, reply = decoded.partition(marker)
        cleaned_bullet_points = clean_summary((reply if found else decoded).strip())

        sections.append(
            "Chunk " + str(chunk_number) + "\n" + cleaned_bullet_points + "\n"
        )

    return "".join(sections)




[docs]
def score_texts_peptide_research(
    texts_to_score, summary, bullet_points, metadata, tokenizer, model
):
    """
    Scores how relevant an article is to peptide research on a 1-10 rubric.

    The model is prompted twice: first with the scoring rubric plus
    ``metadata``, ``summary`` and ``bullet_points`` to obtain a free-text
    verdict, then with that verdict to obtain just the score.  The smallest
    integer found in the second reply is taken as the numeric score.

    The items of ``texts_to_score`` are not inserted into the prompt, and only
    the result for the first item was ever returned.  Both prompts are
    therefore run once rather than once per item, which returns the same
    values with fewer generation calls.

    Parameters
    ----------
    texts_to_score : iterable
        Only checked for emptiness; see above.

    summary : str
        Article summary interpolated into the scoring prompt.

    bullet_points : str
        Bullet points interpolated into the scoring prompt.

    metadata : str
        Metadata text interpolated into the scoring prompt.

    tokenizer : callable
        Called as ``tokenizer(prompt, return_tensors="pt")``; the result must
        support ``.to("cuda")`` and ``**`` unpacking into ``model.generate``.
        Must also provide ``decode(ids, skip_special_tokens=True)``.
        (Presumably a Hugging Face tokenizer -- confirm against the caller.)

    model : object
        Must provide ``generate(**inputs, max_new_tokens=...)``.

    Returns
    -------
    score : str
        The model's full reply to the scoring prompt.

    score_value : str
        The model's reply to the "give back the score" prompt.

    lowest_score : int
        The smallest integer found in ``score_value``, or 0 if it contains no
        digits.

    When ``texts_to_score`` is empty, ``("", "", 0)`` is returned without
    calling the model (previously this raised ``IndexError``).
    """
    marker = "model-gemma\n"

    def _reply(prompt):
        # Run one prompt through the model and return only the model's reply.
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=8000)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The reply is whatever follows the first marker in the decoded text.
        # If the marker is missing, keep the whole text: the previous
        # ``find(...) + len(...)`` arithmetic turned the -1 "not found" result
        # into index 11 and silently dropped the first characters.
        _, found, reply = decoded.partition(marker)
        return (reply if found else decoded).strip()

    # Works for any iterable (not just sequences) and consumes at most one item.
    if not any(True for _ in texts_to_score):
        return "", "", 0

    # The rubric appears twice (detailed, then condensed) around the text to score.
    input_text = f"""
        <start_of_turn>user
        you are to score the text below based on the scoring metrics:

scoring metrics:
     **9-10: Exceptionally Relevant**
   - The text significantly advances peptide research, introducing novel peptides or mechanisms.
   - It mentions unnatural amino acids and demonstrates experimental validation with clear results and uses computational methods.
   - Discusses specific protein targets with detailed computational models or simulations, contributing substantial insights into peptide design or function.
   - May include groundbreaking findings that have a strong potential impact on the field, including therapeutic applications.

 **7-8: Highly Relevant**
   - The text is directly relevant to peptide research, and some form of experimental validation or experimentation.
   - Talks about protein targets and is relevant to computational peptide research, computational methods are used.
   - Includes sound methodology and results that support the findings discussed.
   - The research has a clear application or implication for the field, such as suggesting new areas of study or potential therapeutic uses.

**5-6: Moderately Relevant**
   - The text mentions peptide research but might not delve into specifics about unnatural amino acids or detailed experimental validation.
   - The discussion on protein targets or computational models may be present but lacks depth.
   - The methodology is sound, but the impact on the field might be moderate or not immediately clear.
   - Potential applications or implications for peptide research are suggested but not thoroughly explored.

 **3-4: Somewhat Relevant**
   - The text briefly mentions aspects of peptide research but lacks specificity or detailed discussion.
   - Experimental validation, if mentioned, is vague or general.
   - There is minimal mention of protein targets or computational research, with little to no discussion on the implications or applications.
   - The relevance to current trends or issues in peptide research is minimal or tangential.

**1-2: Irrelevant**
   - The text has little to no mention of peptide research, unnatural amino acids, or experimental validation related to peptides.

        
        text to score:
        "{metadata}
        
        {summary}    
        
        {bullet_points}"
        
        remember to follow the scoring metric we are trying to assess how relevant the text was to peptide research and if it should be a priority for a peptide researcher to read it, if the majority of the discussion doesnt involve peptides its probably a low score
        
scoring metrics:

**9-10: Exceptionally Relevant**
- Advances peptide research with novel findings, including new peptides, mechanisms, or therapeutic applications.
- Demonstrates robust experimental validation and computational analysis targeting specific protein interactions, discusses many peptides and proteins, very high impact on the field of peptides.

**7-8: Highly Relevant**
- Directly contributes to peptide research with experimental evidence and computational insights, moderate to high impact on the field of peptides.
- Presents clear implications for the field, suggesting new research directions or therapeutic applications, mentions specific peptides and proteins

**5-6: Moderately Relevant**
- Discusses peptide research with some mention of methodology or protein targets but lacks depth
- Offers suggestions for the field with moderate impact or unclear applications, discusses atleast a few specific peptides and proteins or targets

**3-4: Somewhat Relevant**
- Briefly mentions peptide research without significant detail or depth, doesnt mention specific peptides or proteins by name, unclear impact on peptide research.
- Lacks clear experimental validation or computational analysis relevant to peptides.

**1-2: Irrelevant**
- Minimal or no mention of peptides, lacking relevance to the field of peptide research.



now give a score, always give a score, score only based on the metrics remeber the target audience is researchers who are used to jargon
        <end_of_turn>

        <start_of_turn>model-gemma
        """
    score = _reply(input_text)

    # Second pass: have the model reduce its own verdict to the bare score.
    extraction_input_text = f"""
<start_of_turn>user
give back the score that was given in the following text, just give back the score nothing else:




{score}

<end_of_turn>
<start_of_turn>model-gemma

"""
    extracted_score = _reply(extraction_input_text)

    # Taking the minimum makes replies such as "7/10" or "7-8" resolve to 7.
    numbers = [int(num) for num in re.findall(r"\d+", extracted_score)]
    lowest_score = min(numbers) if numbers else 0

    return score, extracted_score, lowest_score