Source code for sciencescraper.sciencedirect.scidir_clean

"""
Functions to clean the text extracted from ScienceDirect articles.
"""

from bs4 import BeautifulSoup
from .scidir_extract import (
    get_title,
    get_article_type,
    get_publisher,
    get_journal,
    get_date,
    get_keywords,
    get_abstract,
)


def clean_fulltext(xml_text, chunk_size):
    """
    Clean the raw XML text of a ScienceDirect article to remove unnecessary
    information, leaving only the full text of the article.

    Parameters
    ----------
    xml_text : str
        The raw XML text of an article.
    chunk_size : int or None
        The size of the chunks to split the full text into. If None, the
        text is returned as a single string.

    Returns
    -------
    str or list of str
        The full text of the article, split into chunks of roughly
        chunk_size words when chunk_size is given.
    """
    # Get article information
    title = "Title: " + get_title(xml_text) + ". \n"
    article_type = "Article type: " + get_article_type(xml_text) + ". \n"
    publisher = "Publisher: " + get_publisher(xml_text) + ". \n"
    journal = "Journal: " + get_journal(xml_text) + ". \n"
    date = "Date: " + get_date(xml_text) + ". \n"
    keywords = "Keywords: " + ", ".join(get_keywords(xml_text)) + ". \n"

    soup = BeautifulSoup(xml_text, "xml")

    # Find all section titles
    sections = soup.find_all("ce:section-title")

    if not sections:
        # Article is not separated into sections: join every paragraph
        paras = soup.find_all("ce:para")
        cleaned_text = " ".join(para.text.strip() for para in paras)
        # Strip no-break spaces, narrow no-break spaces, and newlines
        cleaned_text = (
            cleaned_text.replace("\xa0", "").replace("\u202f", "").replace("\n", "")
        )
        full_text = (
            title
            + article_type
            + publisher
            + journal
            + date
            + keywords
            + "Abstract: "
            + get_abstract(xml_text)
            + " "
            + cleaned_text
        )
    else:
        cleaned_text = ""
        for section in sections:
            section_title = section.text.strip()
            # Skip specified section titles
            if section_title.lower() in [
                "keywords",
                "supplementary data",
                "references",
                "data availability",
                "star★methods",
                "key resources table",
                "supplementary material",
            ]:
                continue
            # Find all paragraphs in the section
            section_paragraphs = section.find_next_siblings("ce:para")
            section_text = " ".join(
                para.text.strip() for para in section_paragraphs
            )
            section_text = section_text.replace("\xa0", "")
            section_text = section_text.replace("\n", "")
            cleaned_text += section_title + ": " + section_text + "\n"

        # Combine title, abstract, and cleaned text
        description_tag = soup.find("dc:description")
        if description_tag:
            abstract = " ".join(description_tag.text.strip().split())
        else:
            abstract = "Not found."
        full_text = (
            title
            + article_type
            + publisher
            + journal
            + date
            + keywords
            + "Abstract: "
            + abstract
            + " "
            + cleaned_text
        )

    # Split the text into chunks
    if chunk_size is not None:
        full_text = split_into_chunks(full_text, chunk_size)

    return full_text
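

# A minimal usage sketch, assuming "article.xml" holds a full-text XML
# response from the ScienceDirect Article Retrieval API; the file name and
# the 500-word chunk size are illustrative assumptions, not part of this
# module:
#
#     with open("article.xml", encoding="utf-8") as fh:
#         xml_text = fh.read()
#     chunks = clean_fulltext(xml_text, chunk_size=500)
#     print(len(chunks), "chunks of roughly 500 words each")
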
def split_into_chunks(text, chunk_size):
    """
    Split a given text into chunks of approximately 'chunk_size' words.

    Parameters
    ----------
    text : str
        The text to split into chunks.
    chunk_size : int
        The size of the chunks to split the text into.

    Returns
    -------
    list of str
        List of the text split into chunks.
    """
    words = text.split()  # Split the text into words
    chunks = [
        " ".join(words[i : i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]
    return chunks
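

# A minimal sketch exercising split_into_chunks on a made-up five-word
# string; the sample text and chunk size are illustrative assumptions only.
# Note that the final chunk may hold fewer than chunk_size words.
if __name__ == "__main__":
    print(split_into_chunks("one two three four five", 2))
    # -> ['one two', 'three four', 'five']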