# Source code for sciencescraper.pmc.pmc_clean

"""
Functions to clean the data extracted from PMC.
"""

from .pmc_extract import (
    get_title,
    get_journal,
    get_publisher,
    get_article_type,
    get_date,
    get_keywords,
    get_abstract,
    clean_references,
)


def clean_full_text(pmc_article, chunk_size):
    """
    Return the full text of the article, excluding figures, tables, and
    supplementary information sections, with reference numbers removed.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object.
    chunk_size : int
        The size (in words) of the chunks to split the full text into.
        A falsy value (0, None) disables chunking.

    Returns
    -------
    str or list of str or None
        The cleaned full text (a list of chunks when ``chunk_size`` is
        truthy), or ``None`` when the article has no ``<body>`` element.
    """
    # Bail out early when there is no article body: avoids building the
    # metadata header for articles we then discard anyway, and reuses the
    # single find() result instead of calling find("body") twice.
    body = pmc_article.find("body")
    if body is None:
        return None

    # Metadata header prepended to the article text.
    full_text = "".join(
        [
            "Title: " + get_title(pmc_article) + ". \n",
            "Article type: " + get_article_type(pmc_article) + ". \n",
            "Publisher: " + get_publisher(pmc_article) + " \n",
            "Journal: " + get_journal(pmc_article) + ". \n",
            "Date: " + get_date(pmc_article) + ". \n",
            "Keywords: " + ", ".join(get_keywords(pmc_article)) + ". \n",
            "Abstract: " + get_abstract(pmc_article) + ". \n",
        ]
    )

    # Drop figures and tables entirely from the parse tree.
    for fig in body.find_all("fig"):
        fig.decompose()
    for table in body.find_all("table-wrap"):
        table.decompose()

    # Section titles marking supplementary material we do not want.
    # Hoisted out of the loop — it is loop-invariant.
    unwanted_sections = (
        "Supplementary",
        "Online content",
        "Source data",
        "Reporting summary",
    )

    for sec in body.find_all("sec"):
        title_tag = sec.find("title")
        if title_tag is None:  # Skip untitled sections
            continue
        title = title_tag.get_text()
        # Skip supplementary information sections
        if any(section in title for section in unwanted_sections):
            continue
        # Remove reference numbers (in-text citation cross-references)
        for ref in sec.find_all("xref"):
            ref.decompose()
        section_content = ""
        for para in sec.find_all("p"):
            section_content += para.get_text(separator=" ") + " "
        # Clean up text: strip Unicode minus signs, collapse whitespace
        section_content = section_content.replace("−", "")
        section_content = " ".join(section_content.split())
        # Format the subtitle with a colon
        full_text += f"{title}: {section_content}\n\n"

    full_text = clean_references(full_text)
    if chunk_size:
        full_text = split_into_chunks(full_text, chunk_size)
    return full_text
def split_into_chunks(text, chunk_size):
    """
    Split a given text into chunks of approximately ``chunk_size`` words.

    Parameters
    ----------
    text : str
        The text to split into chunks.
    chunk_size : int
        The size of the chunks to split the text into.

    Returns
    -------
    list of str
        List of the text split into whitespace-joined word chunks.
    """
    # Tokenise on any whitespace, then re-join consecutive runs of
    # chunk_size words with single spaces.
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[start : start + chunk_size]))
    return chunks