# Source code for sciencescraper.pmc.pmc_search

"""
Functions for searching for articles on PubMed Central.
"""

import requests
from bs4 import BeautifulSoup
import time

from .pmc_scrape import get_article_info


def search_pmc(
    query,
    sort="relevance",
    mindate=None,
    maxdate=None,
    reldate=None,
    retstart=0,
    retmax=20,
):
    """
    Search PMC for free-full-text articles matching a query.

    The query is sent to the NCBI E-utilities ``esearch`` endpoint with
    ``AND free fulltext[filter]`` appended, and the ``<Id>`` elements of the
    XML response are returned.

    Parameters
    ----------
    query : str
        The query to search for
    sort : str, optional
        The sorting order for the search results. Options are:
        - "relevance": Sort by relevance
        - "pub_date": Sort by publication date in descending order
        - "JournalName": Sort by journal in ascending order
        - "Author": Sort by first author in ascending order
    mindate : str, optional
        The minimum date for the search results. Format is "YYYY/MM/DD",
        "YYYY/MM", or "YYYY". Must also provide maxdate
    maxdate : str, optional
        The maximum date for the search results. Format is "YYYY/MM/DD",
        "YYYY/MM", or "YYYY". Must also provide mindate
    reldate : str, optional
        The number of days to search back from the current date.
    retstart : int, optional
        The index of the first article to return
    retmax : int, optional
        The maximum number of articles to return

    Returns
    -------
    pmc_ids : list or None
        The PMC IDs of the search results, or None (after printing a
        message) if the server answered with a non-200 status code.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed, including when the server does
        not respond within the timeout.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    # Without a timeout, requests waits forever on an unresponsive server.
    timeout_seconds = 30

    params = {
        "db": "pmc",
        "term": f"{query} AND free fulltext[filter]",
        "sort": sort,
        "datetype": "pdat",
        "retstart": retstart,
        "retmax": retmax,
    }

    # Only send the date filters the caller actually supplied.
    optional_dates = {"mindate": mindate, "maxdate": maxdate, "reldate": reldate}
    params.update(
        {name: value for name, value in optional_dates.items() if value is not None}
    )

    response = requests.get(url, params=params, timeout=timeout_seconds)
    if response.status_code != 200:
        print(f"Failed to fetch PMC IDs for query {query}")
        return None

    soup = BeautifulSoup(response.text, "xml")
    return [id_tag.text for id_tag in soup.find_all("Id")]
def check_new_articles(query, days, chunk_size=None):
    """
    Get open access articles from PubMed Central that were published within
    the last ``days`` days.

    Parameters
    ----------
    query : str
        The query to search for
    days : int
        The number of days to search back from the current date.
    chunk_size : int, optional
        The size of the chunks to split the full text into. Forwarded
        unchanged to ``get_article_info``.

    Returns
    -------
    pmc_articles : list of dict
        A list of dictionaries containing article information, one per PMC
        ID returned by the search. Empty if the search request failed or
        matched nothing.
    """
    pmc_ids = search_pmc(query, reldate=days)
    if pmc_ids is None:
        # search_pmc returns None (and has already printed a message) when
        # the request fails; iterating over None would raise TypeError.
        return []

    pmc_articles = []
    for pmc_id in pmc_ids:
        pmc_articles.append(get_article_info(pmc_id, chunk_size))
        # Wait for 1 second to avoid overloading the server
        time.sleep(1)

    notify_new_articles(pmc_articles)
    return pmc_articles
def notify_new_articles(articles):
    """
    Notify the user of new articles by printing a summary line.

    Only the number of entries is used; the contents of each entry are not
    inspected.

    Parameters
    ----------
    articles : list of dict
        A list of dictionaries containing the title, authors, journal, year,
        URL, open access status, keywords, abstract, methods, results,
        discussion, and references of the new articles.
    """
    if not articles:
        print("No new articles found.")
        return

    count = len(articles)
    # Use the singular form for exactly one article ("1 new article!").
    noun = "article" if count == 1 else "articles"
    print(f"PubMed Central has {count} new {noun}!")