Source code for peptidedigest.article_processing

"""
Functions to process and analyze articles using the model.
"""

import sciencescraper as scraper

from .article_db import insert_article, check_article_exists, update_article
from .model_prompts import (
    summarize_article_segments,
    summarize_article_meta,
    score_texts_peptide_research,
)
from .clean_text import (
    split_into_chunks,
    extract_metadata,
    clean_summary,
)


def process_scidir_article(
    database,
    tokenizer,
    model,
    api_key,
    doi=None,
    pii=None,
    url=None,
    chunk_size=4200,
    update=False,
):
    """
    Process a ScienceDirect article, summarize it using the model, and store
    the information in the database.

    Parameters
    ----------
    database : str
        The database in which to store the processed article information.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer to use for the model.
    model : transformers.PreTrainedModel
        The model to use to process the article.
    api_key : str
        The API key for the ScienceDirect API. API keys can be obtained by
        creating an account at https://dev.elsevier.com/.
    doi : str, optional
        The DOI of the article to be processed.
    pii : str, optional
        The PII of the article to be processed.
    url : str, optional
        The URL of the article to be processed.
    chunk_size : int, optional
        The size of the chunks to split the full text into. Default is 4200.
    update : bool, optional
        If True, the article will be updated in the database if it already
        exists. Default is False.

    Returns
    -------
    None
        The processed article information is stored in the database.
    """
    # Check whether the article is already in the database. Initialize the
    # flag so it is defined even when no identifier matches below.
    article_exists = False
    if doi is not None:
        article_exists = check_article_exists(database, doi, "doi")
        if article_exists and not update:
            return
    elif pii is not None:
        article_exists = check_article_exists(database, pii, "pii")
        if article_exists and not update:
            return
    elif url is not None:
        article_exists = check_article_exists(database, url, "url")
        if article_exists and not update:
            return

    article_info = scraper.get_scidir_article_info(
        api_key, doi=doi, pii=pii, url=url, chunk_size=chunk_size
    )
    article_info["scidir/pmc"] = "scidir"
    article_info["pmc_id"] = None

    # Fall back to the (already chunked) full text when no abstract was found.
    if article_info["abstract"] == "Abstract not found in article.":
        abstract_discussion = article_info["full_text"]
    else:
        abstract_discussion = (
            article_info["abstract"]
            + " "
            + str(article_info["keywords"])
            + " "
            + article_info["methods"]
            + " "
            + article_info["discussion"]
        )
        abstract_discussion = split_into_chunks(abstract_discussion, chunk_size)

    bullet_points, summary = summarize_article_segments(
        abstract_discussion, tokenizer, model
    )
    metadata = summarize_article_meta(abstract_discussion, tokenizer, model)
    extracted_data = extract_metadata(metadata)
    score_justification, score_range, score = score_texts_peptide_research(
        abstract_discussion, summary, bullet_points, metadata, tokenizer, model
    )

    model_output = {
        "summary": clean_summary(summary),
        "bullet_points": clean_summary(bullet_points),
        "metadata": clean_summary(metadata),
        "peptides": extracted_data["peptides"],
        "proteins": extracted_data["proteins"],
        "domains": extracted_data["domains"],
        "chemistry": extracted_data["chemistry"],
        "biology": extracted_data["biology"],
        "computational_methods": extracted_data["computational_methods"],
        "score": score,
        "score_justification": clean_summary(score_justification),
    }

    if article_exists:
        # Use the DOI from the scraped record so updates also work when the
        # article was identified by PII or URL.
        update_article(database, article_info["doi"], model_output)
    else:
        insert_article(database, article_info, model_output)
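# A minimal usage sketch for process_scidir_article. The model name, database
# path, API key, and DOI below are placeholders, not values defined by this
# package; an instruction-tuned Hugging Face causal LM is assumed for the
# tokenizer/model arguments.
#
#     from transformers import AutoModelForCausalLM, AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("<model-name>")
#     model = AutoModelForCausalLM.from_pretrained("<model-name>")
#
#     process_scidir_article(
#         "articles.db",
#         tokenizer,
#         model,
#         api_key="<elsevier-api-key>",
#         doi="<article-doi>",
#     )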
def process_multiple_scidir_articles(
    database,
    tokenizer,
    model,
    api_key,
    dois=None,
    piis=None,
    urls=None,
    chunk_size=4200,
    update=False,
):
    """
    Process multiple ScienceDirect articles, summarize the articles using the
    model, and store the information in the database.

    Parameters
    ----------
    database : str
        The database in which to store the processed article information.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer to use for the model.
    model : transformers.PreTrainedModel
        The model to use to process the articles.
    api_key : str
        The API key for the ScienceDirect API. API keys can be obtained by
        creating an account at https://dev.elsevier.com/.
    dois : list of str, optional
        The DOIs of the articles to be processed.
    piis : list of str, optional
        The PIIs of the articles to be processed.
    urls : list of str, optional
        The URLs of the articles to be processed.
    chunk_size : int, optional
        The size of the chunks to split the full text into. Default is 4200.
    update : bool, optional
        If True, the articles will be updated in the database if they already
        exist. Default is False.

    Returns
    -------
    None
        The processed article information is stored in the database.
    """
    if dois is not None:
        for doi in dois:
            process_scidir_article(
                database,
                tokenizer,
                model,
                api_key,
                doi=doi,
                chunk_size=chunk_size,
                update=update,
            )
    if piis is not None:
        for pii in piis:
            process_scidir_article(
                database,
                tokenizer,
                model,
                api_key,
                pii=pii,
                chunk_size=chunk_size,
                update=update,
            )
    if urls is not None:
        for url in urls:
            process_scidir_article(
                database,
                tokenizer,
                model,
                api_key,
                url=url,
                chunk_size=chunk_size,
                update=update,
            )
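# Usage sketch for batch processing; the identifiers are placeholders. The
# DOI, PII, and URL lists are independent, so any combination can be passed
# in a single call.
#
#     process_multiple_scidir_articles(
#         "articles.db",
#         tokenizer,
#         model,
#         api_key="<elsevier-api-key>",
#         dois=["<doi-1>", "<doi-2>"],
#         update=True,  # re-process and update entries already in the database
#     )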
def process_pmc_article(
    database, tokenizer, model, pmc_id, chunk_size=4200, update=False
):
    """
    Process a PubMed Central article, summarize it using the model, and store
    the information in the database.

    Parameters
    ----------
    database : str
        The database in which to store the processed article information.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer to use for the model.
    model : transformers.PreTrainedModel
        The model to use to process the article.
    pmc_id : str
        The PMC ID of the article to be processed.
    chunk_size : int, optional
        The size of the chunks to split the full text into. Default is 4200.
    update : bool, optional
        If True, the article will be updated in the database if it already
        exists. Default is False.

    Returns
    -------
    None
        The processed article information is stored in the database.
    """
    article_exists = check_article_exists(database, pmc_id, "pmc_id")
    if article_exists and not update:
        return

    article_info = scraper.get_pmc_article_info(pmc_id, chunk_size=chunk_size)
    if article_info is None:
        return

    article_info["scidir/pmc"] = "pmc"
    article_info["pmc_id"] = pmc_id

    # Fall back to the (already chunked) full text when any key section is
    # missing.
    if (
        article_info["abstract"] == ""
        or article_info["methods"] == ""
        or article_info["discussion"] == ""
    ):
        model_input = article_info["full_text"]
    else:
        abstract_discussion = (
            article_info["abstract"]
            + " "
            + str(article_info["keywords"])
            + " "
            + article_info["methods"]
            + " "
            + article_info["discussion"]
        )
        model_input = split_into_chunks(abstract_discussion, chunk_size)

    bullet_points, summary = summarize_article_segments(model_input, tokenizer, model)
    metadata = summarize_article_meta(model_input, tokenizer, model)
    extracted_data = extract_metadata(metadata)
    score_justification, score_range, score = score_texts_peptide_research(
        model_input, summary, bullet_points, metadata, tokenizer, model
    )

    model_output = {
        "summary": clean_summary(summary),
        "bullet_points": clean_summary(bullet_points),
        "metadata": clean_summary(metadata),
        "peptides": extracted_data["peptides"],
        "proteins": extracted_data["proteins"],
        "domains": extracted_data["domains"],
        "chemistry": extracted_data["chemistry"],
        "biology": extracted_data["biology"],
        "computational_methods": extracted_data["computational_methods"],
        "score": score,
        "score_justification": clean_summary(score_justification),
    }

    if article_exists:
        update_article(database, article_info["doi"], model_output)
    else:
        insert_article(database, article_info, model_output)
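# Usage sketch; the PMC ID is a placeholder. Unlike the ScienceDirect route,
# no API key argument is required here.
#
#     process_pmc_article("articles.db", tokenizer, model, "<pmc-id>")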
def process_multiple_pmc_articles(
    database, tokenizer, model, pmc_ids, chunk_size=4200, update=False
):
    """
    Process multiple PubMed Central articles, summarize the articles using
    the model, and store the information in the database.

    Parameters
    ----------
    database : str
        The database in which to store the processed article information.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer to use for the model.
    model : transformers.PreTrainedModel
        The model to use to process the articles.
    pmc_ids : list of str
        The PMC IDs of the articles to be processed.
    chunk_size : int, optional
        The size of the chunks to split the full text into. Default is 4200.
    update : bool, optional
        If True, the articles will be updated in the database if they already
        exist. Default is False.

    Returns
    -------
    None
        The processed article information is stored in the database.
    """
    for pmc_id in pmc_ids:
        process_pmc_article(
            database, tokenizer, model, pmc_id, chunk_size=chunk_size, update=update
        )
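# Usage sketch with placeholder IDs:
#
#     process_multiple_pmc_articles(
#         "articles.db", tokenizer, model, ["<pmc-id-1>", "<pmc-id-2>"]
#     )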