Source code for sciencescraper.pmc.pmc_clean
"""
Functions to clean the data extracted from PMC.
"""
from .pmc_extract import (
    get_title,
    get_journal,
    get_publisher,
    get_article_type,
    get_date,
    get_keywords,
    get_abstract,
    clean_references,
)
def clean_full_text(pmc_article, chunk_size):
    """
    Returns the full text of the article, excluding figures, tables,
    and supplementary information sections, and with reference numbers
    removed.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object.
    chunk_size : int
        The size, in words, of the chunks to split the full text into.
        If falsy, the full text is returned as a single string.

    Returns
    -------
    full_text : str or list of str or None
        The full text of the article, excluding figures, tables,
        supplementary information sections, and reference numbers.
        Returned as a list of chunks if chunk_size is truthy, or as
        None if the article has no body.
    """
    full_text = ""
    article_title = "Title: " + get_title(pmc_article) + ". \n"
    article_type = "Article type: " + get_article_type(pmc_article) + ". \n"
    publisher = "Publisher: " + get_publisher(pmc_article) + " \n"
    journal = "Journal: " + get_journal(pmc_article) + ". \n"
    date = "Date: " + get_date(pmc_article) + ". \n"
    keywords = "Keywords: " + ", ".join(get_keywords(pmc_article)) + ". \n"
    abstract = "Abstract: " + get_abstract(pmc_article) + ". \n"
    full_text += (
        article_title + article_type + publisher + journal + date + keywords + abstract
    )
    body = pmc_article.find("body")
    if body is None:
        return None
    # Drop figures and tables before extracting text
    for fig in body.find_all("fig"):
        fig.decompose()
    for table in body.find_all("table-wrap"):
        table.decompose()
    unwanted_sections = [
        "Supplementary",
        "Online content",
        "Source data",
        "Reporting summary",
    ]
    for sec in body.find_all("sec"):
        title_tag = sec.find("title")
        if title_tag is None:  # Skip sections without a title tag
            continue
        title = title_tag.get_text()
        # Skip supplementary information sections
        if any(section in title for section in unwanted_sections):
            continue
        # Remove reference numbers
        for ref in sec.find_all("xref"):
            ref.decompose()
        content = sec.find_all("p")
        section_content = ""
        for para in content:
            section_content += para.get_text(separator=" ") + " "
        # Clean up text: strip Unicode minus signs and collapse whitespace
        section_content = section_content.replace("−", "")
        section_content = " ".join(section_content.split())
        # Format the section title with a colon
        full_text += f"{title}: {section_content}\n\n"
    full_text = clean_references(full_text)
    if chunk_size:
        full_text = split_into_chunks(full_text, chunk_size)
    return full_text
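
# Illustrative usage (a sketch; assumes `xml_string` holds PMC full-text XML
# fetched elsewhere, e.g. via NCBI efetch):
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup(xml_string, "xml")
#     text = clean_full_text(soup, chunk_size=0)      # single string
#     chunks = clean_full_text(soup, chunk_size=500)  # list of ~500-word chunks
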
def split_into_chunks(text, chunk_size):
    """
    Splits a given text into chunks of approximately `chunk_size` words.

    Parameters
    ----------
    text : str
        The text to split into chunks.
    chunk_size : int
        The size of the chunks to split the text into.

    Returns
    -------
    list of str
        List of the text split into chunks.
    """
    words = text.split()  # Split the text into words
    chunks = [
        " ".join(words[i : i + chunk_size]) for i in range(0, len(words), chunk_size)
    ]
    return chunks
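
# Quick demonstration of the chunking behaviour (illustrative only; the
# sample sentence below is invented and not part of the module):
if __name__ == "__main__":
    sample = "one two three four five six seven"
    print(split_into_chunks(sample, chunk_size=3))
    # -> ['one two three', 'four five six', 'seven']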