Source code for peptidedigest.clean_text

"""
Functions to clean text data.
"""

import re


def split_into_chunks(text, chunk_size):
    """
    Splits a given text into chunks of approximately 'chunk_size' words.

    Parameters
    ----------
    text : str
        The text to split into chunks.
    chunk_size : int
        The approximate number of words to include in each chunk.

    Returns
    -------
    chunks : list of str
        A list of text chunks, each containing approximately 'chunk_size' words.
    """
    words = text.split()  # Split the text into words
    chunks = [
        " ".join(words[i : i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]
    return chunks
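
A quick sanity check of the chunking behavior, shown doctest-style on a hypothetical input:

>>> split_into_chunks("one two three four five", 2)
['one two', 'three four', 'five']
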
def clean_summary(summary_text):
    """
    Cleans a summary text by removing unwanted patterns and phrases.

    Parameters
    ----------
    summary_text : str
        The summary text to clean.

    Returns
    -------
    cleaned_summary : str
        The cleaned summary text.
    """
    # Split the summary text by newlines
    summary_lines = summary_text.split("\n")

    # Remove any lines that start with "##" or contain "Summary"
    cleaned_lines = [
        line
        for line in summary_lines
        if not line.startswith("##") and "Summary" not in line
    ]
    # Remove filler lines such as "Sure, ..." or "Here is ..."
    cleaned_lines = [
        line for line in cleaned_lines if "Sure" not in line and "Here" not in line
    ]

    # Join the cleaned lines back into a single string
    cleaned_summary = "\n".join(cleaned_lines)

    # Extended patterns and exact phrases to remove
    patterns = [
        r"Sure, here is a summary of the provided text in \d+ sentences:",
        r"Sure, here is a \d+-sentence summary of the portion of the scientific article you provided:",
        r"Sure, here is a summary of the scientific article in \d+ sentences:",
        r"## Summary of the Scientific Article in \d+ Sentences",
        r"Sure, here is a \d+-sentence summary of the provided text:",
        r"Sure. Here is the summary in bullet form:",
        r"Sure, here is a summary of the text in \d+ sentences",
        r"Here is a summary of the text in \d+ sentences:",
        r"Sure, here is a summary of the text you provided in a single paragraph:",
        r"'Sure, here is a summary in bullet form of a scientific text",
        r"Chunk \d+",
    ]

    # Remove the exact heading first so every regex pattern below still
    # gets applied
    cleaned_summary = cleaned_summary.replace(
        "## Summary of the Scientific Article in 5 Sentences", ""
    ).strip()
    for pattern in patterns:
        cleaned_summary = re.sub(
            pattern, "", cleaned_summary, flags=re.IGNORECASE
        ).strip()

    return cleaned_summary
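
For example, a filler preamble line is dropped while the substantive sentence survives (hypothetical model output, doctest-style):

>>> clean_summary("Sure, here is a summary of the text in 3 sentences:\nPeptide X binds target Y.")
'Peptide X binds target Y.'
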
def extract_metadata(metadata_text):
    """
    Extract peptides, proteins, domains of interest, chemistry discussed,
    biology discussed, and computational methods discussed from the model
    metadata text.

    Parameters
    ----------
    metadata_text : str
        The model metadata text to be parsed.

    Returns
    -------
    dict
        A dictionary containing the extracted metadata as lists.
    """
    # Section headers as they appear in the model output, keyed by the
    # dictionary entry each one fills
    section_patterns = {
        "peptides": r"\*\*Peptides discussed:\*\*\n(.*?)(\n\n|\Z)",
        "proteins": r"\*\*Proteins/targets discussed:\*\*\n(.*?)(\n\n|\Z)",
        "domains": r"\*\*Domains of interest:\*\*\n(.*?)(\n\n|\Z)",
        "chemistry": r"\*\*Chemical matter/chemistry discussed:\*\*\n(.*?)(\n\n|\Z)",
        "biology": r"\*\*Biological matter/biology discussed:\*\*\n(.*?)(\n\n|\Z)",
        "computational_methods": r"\*\*Computational methods:\*\*\n(.*?)(\n\n|\Z)",
    }

    metadata_dict = {key: None for key in section_patterns}

    for key, pattern in section_patterns.items():
        match = re.search(pattern, metadata_text, re.S)
        if not match:
            continue
        section_text = match.group(1).strip()
        # Skip sections that report nothing was found. Note this is a
        # substring check, so words containing e.g. "not" also match.
        if any(
            keyword in section_text.lower()
            for keyword in ("not", "any", "n/a", "none", "null")
        ):
            continue
        items = [item.strip() for item in section_text.split("\n- ")]
        # The first item keeps its leading bullet after the split; strip it
        items[0] = items[0].lstrip("- ")
        metadata_dict[key] = items

    return metadata_dict
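
A sketch of the expected input shape and output, using hypothetical model output (doctest-style):

>>> text = "**Peptides discussed:**\n- melittin\n- apamin\n\n**Proteins/targets discussed:**\nNone mentioned"
>>> result = extract_metadata(text)
>>> result["peptides"]
['melittin', 'apamin']
>>> result["proteins"] is None
True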