Source code for sciencescraper.pmc.pmc_extract

"""
Functions that extract information from the raw text of PubMed Central articles.
"""

import copy
import re



[docs]
def get_title(pmc_article):
    """
    Returns the title of the article

    The title is read from the first ``article-title`` element. Newline and
    tab characters are deleted from it (nothing is substituted for them).

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    title : str
        The title of the article, or None if there is no ``article-title``
        element
    """
    title_tag = pmc_article.find("article-title")
    if title_tag is None:
        return None
    # One pass that deletes both layout characters.
    return title_tag.text.translate(str.maketrans("", "", "\n\t"))




[docs]
def get_authors(pmc_article):
    """
    Returns the authors of the article

    Every ``contrib`` element that contains a ``name`` element contributes
    one entry, formatted as "<given-names> <surname>" with surrounding
    whitespace stripped. Contributors without a ``name`` element are skipped.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    authors : list
        The authors of the article (empty if none are found)
    """

    def _stripped_text(parent, tag_name):
        # Text of the first matching descendant, or "" when it is absent.
        node = parent.find(tag_name)
        return node.text.strip() if node else ""

    names = []
    for contrib in pmc_article.find_all("contrib"):
        if contrib.find("name") is None:
            continue
        last = _stripped_text(contrib, "surname")
        first = _stripped_text(contrib, "given-names")
        # The outer strip removes the stray space left by a missing part.
        names.append(f"{first} {last}".strip())
    return names




[docs]
def get_journal(pmc_article):
    """
    Returns the journal of the article

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    journal : str
        The text of the first ``journal-title`` element, or None if the
        article has none
    """
    journal_tag = pmc_article.find("journal-title")
    return None if journal_tag is None else journal_tag.text




[docs]
def get_publisher(pmc_article):
    """
    Returns the publisher of the article

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    publisher : str
        The text of the first ``publisher-name`` element, or None if the
        article has none
    """
    publisher_tag = pmc_article.find("publisher-name")
    return None if publisher_tag is None else publisher_tag.text




[docs]
def get_article_type(pmc_article):
    """
    Returns the article type of the article

    The value is read from the ``article-type`` attribute of the first
    ``article`` element.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    article_type : str
        The article type of the article, or None if there is no ``article``
        element or it carries no ``article-type`` attribute
    """
    article = pmc_article.find("article")
    if article is None:
        return None
    # .get() yields None for a missing attribute, whereas subscripting the
    # tag would raise KeyError before any None check could run.
    return article.get("article-type")




[docs]
def get_doi(pmc_article):
    """
    Returns the DOI of the article

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    doi : str
        The text of the first ``article-id`` element whose ``pub-id-type``
        attribute is "doi", or None if there is no such element
    """
    doi_tag = pmc_article.find("article-id", {"pub-id-type": "doi"})
    if doi_tag is None:
        return None
    return doi_tag.text




[docs]
def get_pmc_id(pmc_article):
    """
    Returns the PMC ID of the article

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    pmc_id : str
        The text of the first ``article-id`` element whose ``pub-id-type``
        attribute is "pmc", or None if there is no such element
    """
    pmc_tag = pmc_article.find("article-id", {"pub-id-type": "pmc"})
    if pmc_tag is None:
        return None
    return pmc_tag.text




[docs]
def get_date(pmc_article):
    """
    Returns the date of the article

    The date is built from the ``year``, ``month`` and ``day`` children of
    the first ``pub-date`` element, joined with hyphens in that order. The
    values are used as they appear in the article (whitespace stripped, no
    zero padding). The result stops at the first missing component, so a
    partial date yields "YYYY" or "YYYY-MM" rather than a string with
    dangling hyphens.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    date : str
        The date of the article, or None if there is no ``pub-date``
        element or it has no year
    """
    pub_date = pmc_article.find("pub-date")
    if pub_date is None:
        return None

    parts = []
    for component in ("year", "month", "day"):
        node = pub_date.find(component)
        value = node.text.strip() if node is not None else ""
        if not value:
            # A finer-grained component is meaningless without the coarser
            # one before it, so stop instead of emitting an empty slot.
            break
        parts.append(value)

    return "-".join(parts) if parts else None




[docs]
def get_url(pmc_article):
    """
    Returns the URL of the article

    The URL is formed by inserting the value returned by ``get_pmc_id``
    into the PubMed Central article path.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    url : str
        The URL of the article, or None if the article has no PMC ID
    """
    identifier = get_pmc_id(pmc_article)
    return (
        None
        if identifier is None
        else f"https://www.ncbi.nlm.nih.gov/pmc/articles/{identifier}/"
    )




[docs]
def get_keywords(pmc_article):
    """
    Returns the keywords of the article

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    keywords : list
        The text of every ``kwd`` element, in the order returned by
        ``find_all`` (empty if the article has none)
    """
    return [kwd.text for kwd in pmc_article.find_all("kwd")]




[docs]
def get_abstract(pmc_article):
    """
    Returns the abstract of the article

    The text of the first ``abstract`` element is cleaned before it is
    returned: every "−" character is dropped, each run of whitespace is
    collapsed to a single space, and bracketed reference numbers are removed
    with ``clean_references``.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object

    Returns
    -------
    abstract : str
        The abstract of the article, or None if there is no ``abstract``
        element
    """
    abstract_tag = pmc_article.find("abstract")
    if abstract_tag is None:
        return None

    # Splitting on whitespace and re-joining normalises all spacing at once.
    words = abstract_tag.text.replace("−", "").split()
    return clean_references(" ".join(words))




[docs]
def get_intro(pmc_article):
    """
    Returns the introduction section of the article

    Text is collected from the paragraphs of every ``sec`` element whose
    title contains "introduction" (case-insensitive). Figures and table
    wrappers are excluded, whitespace is collapsed, and bracketed reference
    numbers are removed with ``clean_references``. A matching subsection
    nested inside an already collected section is not collected a second
    time. Several matching sections are joined with a single space.

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object. It is not modified.

    Returns
    -------
    intro : str
        The introduction section of the article, an empty string if no
        matching section exists, or None if the article has no ``body``
    """
    original_body = pmc_article.find("body")
    if original_body is None:
        return None

    # Work on a detached copy so that stripping figures and tables does not
    # destroy those elements in the caller's parse tree.
    body = copy.copy(original_body)

    for fig in body.find_all("fig"):
        fig.decompose()

    for table in body.find_all("table-wrap"):
        table.decompose()

    sections = []
    captured = set()  # id() of every section whose paragraphs were collected

    for sec in body.find_all("sec"):
        # sec.find_all("p") already descends into subsections, so a matching
        # subsection of a collected section would only repeat its text.
        if any(id(ancestor) in captured for ancestor in sec.parents):
            continue

        title_tag = sec.find("title")
        if title_tag is None:
            continue
        if "introduction" not in title_tag.get_text().lower():
            continue

        captured.add(id(sec))
        text = " ".join(para.get_text(separator=" ") for para in sec.find_all("p"))
        # NOTE(review): this drops the "−" character everywhere, including
        # inside numeric values — confirm that is intended.
        text = text.replace("−", "")
        sections.append(" ".join(text.split()))

    # A single match is returned exactly as collected; several matches are
    # separated by one space so their words do not run together.
    intro = " ".join(text for text in sections if text)
    return clean_references(intro)




[docs]
def get_methods(pmc_article):
    """
    Returns the methods section of the article

    Text is collected from the paragraphs of every ``sec`` element whose
    title contains "method" (case-insensitive). Figures and table wrappers
    are excluded, whitespace is collapsed, and bracketed reference numbers
    are removed with ``clean_references``. A matching subsection nested
    inside an already collected section is not collected a second time.
    The text of each collected section is followed by a blank line
    ("\\n\\n").

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object. It is not modified.

    Returns
    -------
    methods : str
        The methods section of the article, an empty string if no matching
        section exists, or None if the article has no ``body``
    """
    original_body = pmc_article.find("body")
    if original_body is None:
        return None

    # Work on a detached copy so that stripping figures and tables does not
    # destroy those elements in the caller's parse tree.
    body = copy.copy(original_body)

    for fig in body.find_all("fig"):
        fig.decompose()

    for table in body.find_all("table-wrap"):
        table.decompose()

    sections = []
    captured = set()  # id() of every section whose paragraphs were collected

    for sec in body.find_all("sec"):
        # sec.find_all("p") already descends into subsections, so a matching
        # subsection (e.g. "Statistical methods" inside "Methods") would
        # only repeat text that has been collected already.
        if any(id(ancestor) in captured for ancestor in sec.parents):
            continue

        title_tag = sec.find("title")
        if title_tag is None:
            continue
        if "method" not in title_tag.get_text().lower():
            continue

        captured.add(id(sec))
        text = " ".join(para.get_text(separator=" ") for para in sec.find_all("p"))
        # NOTE(review): this drops the "−" character everywhere, including
        # inside numeric values — confirm that is intended.
        text = text.replace("−", "")
        sections.append(" ".join(text.split()))

    methods = "".join(f"{text}\n\n" for text in sections)
    return clean_references(methods)




[docs]
def get_discussion(pmc_article):
    """
    Returns the discussion section of the article

    Text is collected from the paragraphs of every ``sec`` element whose
    title contains "discussion" (case-insensitive). Figures and table
    wrappers are excluded, whitespace is collapsed, and bracketed reference
    numbers are removed with ``clean_references``. A matching subsection
    nested inside an already collected section is not collected a second
    time. The text of each collected section is followed by a blank line
    ("\\n\\n").

    Parameters
    ----------
    pmc_article : BeautifulSoup
        The article as a BeautifulSoup object. It is not modified.

    Returns
    -------
    discussion : str
        The discussion section of the article, an empty string if no
        matching section exists, or None if the article has no ``body``
    """
    original_body = pmc_article.find("body")
    if original_body is None:
        return None

    # Work on a detached copy so that stripping figures and tables does not
    # destroy those elements in the caller's parse tree.
    body = copy.copy(original_body)

    for fig in body.find_all("fig"):
        fig.decompose()

    for table in body.find_all("table-wrap"):
        table.decompose()

    sections = []
    captured = set()  # id() of every section whose paragraphs were collected

    for sec in body.find_all("sec"):
        # sec.find_all("p") already descends into subsections, so a matching
        # subsection of a collected section would only repeat its text.
        if any(id(ancestor) in captured for ancestor in sec.parents):
            continue

        title_tag = sec.find("title")
        if title_tag is None:
            continue
        if "discussion" not in title_tag.get_text().lower():
            continue

        captured.add(id(sec))
        text = " ".join(para.get_text(separator=" ") for para in sec.find_all("p"))
        # NOTE(review): this drops the "−" character everywhere, including
        # inside numeric values — confirm that is intended.
        text = text.replace("−", "")
        sections.append(" ".join(text.split()))

    discussion = "".join(f"{text}\n\n" for text in sections)
    return clean_references(discussion)





[docs]
def clean_references(pmc_article):
    """
    Removes the reference numbers from the article which can clutter the text in terms of
    readability and analysis by a machine learning model.

    A reference marker is a pair of square brackets containing only digits,
    whitespace and commas (for example "[1]" or "[2, 3]"). Numeric ranges
    whose two numbers are joined by a hyphen or dash, such as "[1-3]" or
    "[4–6]", are removed as well. Brackets holding anything else are left
    alone. Only the marker is deleted; surrounding spaces are kept.

    Parameters
    ----------
    pmc_article : str
        The text to clean (despite its name, this is a string and not a
        BeautifulSoup object)

    Returns
    -------
    cleaned_text : str
        The text with the reference markers removed
    """
    pattern = (
        r"\["
        r"(?:"
        # Digits, whitespace and commas, anywhere between the brackets.
        r"[\d\s,]"
        # A hyphen, Unicode dash (U+2010..U+2015) or minus sign (U+2212),
        # accepted only when it joins two numbers, so that text such as
        # "[-1, 1]" is not mistaken for a citation range.
        r"|(?<=\d)\s*[\-\u2010-\u2015\u2212]\s*(?=\d)"
        r")+"
        r"\]"
    )
    cleaned_text = re.sub(pattern, "", pmc_article)
    return cleaned_text