Source code for peptidedigest.clean_text

"""
Functions to clean text data.
"""

import re


def split_into_chunks(text, chunk_size):
    """
    Splits a given text into chunks of approximately 'chunk_size' words.

    Parameters
    ----------
    text : str
        The text to split into chunks.
    chunk_size : int
        The approximate number of words to include in each chunk.

    Returns
    -------
    chunks : list of str
        A list of text chunks, each containing approximately 'chunk_size' words.
    """
    words = text.split()  # Split the text into words
    chunks = [
        " ".join(words[i : i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]
    return chunks
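
A quick sanity check of the chunking behavior, shown doctest-style on a hypothetical input:

>>> split_into_chunks("one two three four five", 2)
['one two', 'three four', 'five']
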
def clean_summary(summary_text):
    """
    Cleans a summary text by removing unwanted patterns and phrases.

    Parameters
    ----------
    summary_text : str
        The summary text to clean.

    Returns
    -------
    cleaned_summary : str
        The cleaned summary text.
    """
    # Split the summary text by newlines
    summary_lines = summary_text.split("\n")

    # Remove any lines that start with "##" or contain "Summary"
    cleaned_lines = [
        line
        for line in summary_lines
        if not line.startswith("##") and "Summary" not in line
    ]
    # Remove filler lines such as "Sure, ..." or "Here is ..."
    cleaned_lines = [
        line for line in cleaned_lines if "Sure" not in line and "Here" not in line
    ]

    # Join the cleaned lines back into a single string
    cleaned_summary = "\n".join(cleaned_lines)

    # Extended patterns and exact phrases to remove
    patterns = [
        r"Sure, here is a summary of the provided text in \d+ sentences:",
        r"Sure, here is a \d+-sentence summary of the portion of the scientific article you provided:",
        r"Sure, here is a summary of the scientific article in \d+ sentences:",
        r"## Summary of the Scientific Article in \d+ Sentences",
        r"Sure, here is a \d+-sentence summary of the provided text:",
        r"Sure. Here is the summary in bullet form:",
        r"Sure, here is a summary of the text in \d+ sentences",
        r"Here is a summary of the text in \d+ sentences:",
        r"Sure, here is a summary of the text you provided in a single paragraph:",
        r"'Sure, here is a summary in bullet form of a scientific text",
        r"Chunk \d+",
    ]

    # Remove the exact heading first so every regex pattern below still
    # gets applied
    cleaned_summary = cleaned_summary.replace(
        "## Summary of the Scientific Article in 5 Sentences", ""
    ).strip()
    for pattern in patterns:
        cleaned_summary = re.sub(
            pattern, "", cleaned_summary, flags=re.IGNORECASE
        ).strip()

    return cleaned_summary
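
For example, a filler preamble line is dropped while the substantive sentence survives (hypothetical model output, doctest-style):

>>> clean_summary("Sure, here is a summary of the text in 3 sentences:\nPeptide X binds target Y.")
'Peptide X binds target Y.'
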
def extract_metadata(metadata_text):
    """
    Extract peptides, proteins, domains of interest, chemistry discussed,
    biology discussed, and computational methods discussed from the model
    metadata text.

    Parameters
    ----------
    metadata_text : str
        The model metadata text to be parsed.

    Returns
    -------
    dict
        A dictionary containing the extracted metadata as lists.
    """
    # Section headers as they appear in the model output, keyed by the
    # dictionary entry each one fills
    section_patterns = {
        "peptides": r"\*\*Peptides discussed:\*\*\n(.*?)(\n\n|\Z)",
        "proteins": r"\*\*Proteins/targets discussed:\*\*\n(.*?)(\n\n|\Z)",
        "domains": r"\*\*Domains of interest:\*\*\n(.*?)(\n\n|\Z)",
        "chemistry": r"\*\*Chemical matter/chemistry discussed:\*\*\n(.*?)(\n\n|\Z)",
        "biology": r"\*\*Biological matter/biology discussed:\*\*\n(.*?)(\n\n|\Z)",
        "computational_methods": r"\*\*Computational methods:\*\*\n(.*?)(\n\n|\Z)",
    }

    metadata_dict = {key: None for key in section_patterns}

    for key, pattern in section_patterns.items():
        match = re.search(pattern, metadata_text, re.S)
        if not match:
            continue
        section_text = match.group(1).strip()
        # Skip sections that report nothing was found. Note this is a
        # substring check, so words containing e.g. "not" also match.
        if any(
            keyword in section_text.lower()
            for keyword in ("not", "any", "n/a", "none", "null")
        ):
            continue
        items = [item.strip() for item in section_text.split("\n- ")]
        # The first item keeps its leading bullet after the split; strip it
        items[0] = items[0].lstrip("- ")
        metadata_dict[key] = items

    return metadata_dict
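
A sketch of the expected input shape and output, using hypothetical model output (doctest-style):

>>> text = "**Peptides discussed:**\n- melittin\n- apamin\n\n**Proteins/targets discussed:**\nNone mentioned"
>>> result = extract_metadata(text)
>>> result["peptides"]
['melittin', 'apamin']
>>> result["proteins"] is None
True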