import re
from typing import Tuple
import pandas as pd
from numpy import int64
from rcx_tk.io import read_file
from rcx_tk.io import save_dataframe_as_tsv
from rcx_tk.utils import replace_spaces
from rcx_tk.utils import validate_filename
[docs]
def process_sequence_file(file_path: str, out_path: str) -> None:
    """Load a metadata file, process it and write the result out.

    The file is loaded with ``read_file``, transformed by
    ``process_sequence`` and handed to ``save_dataframe_as_tsv``.

    Args:
        file_path (str): Path of the metadata file to load.
        out_path (str): Destination path for the processed metadata dataframe.
    """
    processed = process_sequence(read_file(file_path))
    save_dataframe_as_tsv(processed, out_path)
[docs]
def process_sequence(df: pd.DataFrame) -> pd.DataFrame:
    """Processes the metadata dataframe.

    Steps, in order: keep and rename the relevant columns, validate the file
    names, validate the injection order, derive ``sampleName`` from
    ``File name`` via ``replace_spaces``, pass the frame through
    ``derive_additional_metadata`` and finally through ``cleanup``.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        pd.DataFrame: A metadata dataframe with rearranged and newly derived columns.

    Raises:
        ValueError: If any file name is invalid, or if the injectionOrder
            column fails ``validate_injection_order``.
    """
    df = rearrange_columns(df)
    validate_filenames_column(df)
    # validate_injection_order only *reports* its verdict as a bool; act on it
    # here so a bad column is rejected instead of being silently accepted.
    if not validate_injection_order(df):
        raise ValueError("Invalid injectionOrder: expected integer values.")
    df["sampleName"] = df["File name"].apply(replace_spaces)
    df = derive_additional_metadata(df)
    return cleanup(df)
[docs]
def cleanup(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the ``File name`` column and make ``sampleName`` the first column.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        pd.DataFrame: A new dataframe without ``File name`` and with
            ``sampleName`` in the leading position.
    """
    # drop() returns a new frame, so the pop/insert below never touch the input.
    trimmed = df.drop(columns="File name")
    sample_names = trimmed.pop("sampleName")
    trimmed.insert(loc=0, column="sampleName", value=sample_names)
    return trimmed
[docs]
def validate_injection_order(df: pd.DataFrame) -> bool:
    """Validates if injectionOrder is of integer type.

    Any integer dtype is accepted (for example int32 or int64), not only
    ``int64``, so the check does not depend on how the column was loaded.

    Args:
        df (pd.DataFrame): The metadata dataframe; must contain an
            ``injectionOrder`` column.

    Returns:
        bool: Whether the injectionOrder is integer.
    """
    # bool() guarantees a plain Python bool regardless of what pandas returns.
    return bool(pd.api.types.is_integer_dtype(df["injectionOrder"]))
[docs]
def rearrange_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Select the metadata columns of interest and give them their new names.

    ``File name`` is kept under its own name; the remaining four columns are
    renamed according to the mapping below. Any other column is discarded.

    Args:
        df (pd.DataFrame): The metadata dataframe.

    Returns:
        pd.DataFrame: The processed dataframe.
    """
    renamed = {
        "Type": "sampleType",
        "Class ID": "class",
        "Batch": "batch",
        "Analytical order": "injectionOrder",
    }
    # Dict insertion order fixes the resulting column order after "File name".
    selected = df[["File name", *renamed]]
    return selected.rename(columns=renamed)
[docs]
def validate_filenames_column(df: pd.DataFrame) -> None:
    """Check every entry of the ``File name`` column with ``validate_filename``.

    Args:
        df (pd.DataFrame): A dataframe to process.

    Raises:
        ValueError: An error if there is any invalid file name.
    """
    checks = df["File name"].apply(validate_filename)
    if checks.all():
        return
    raise ValueError("Invalid File name.")
[docs]
def add_local_order(file_name: str) -> int:
    """Extract the localOrder value: the trailing digits of the file name, as an int.

    Args:
        file_name (str): The filename.

    Returns:
        int: The localOrder value.
    """
    # Index 1 of the split is the trailing numeric suffix.
    return int(separate_filename(file_name)[1])
[docs]
def add_sequence_identifier(file_name: str) -> str:
    """Extract the sequenceIdentifier value: everything before the trailing digits.

    Trailing underscores are removed from the prefix first, then any
    surrounding whitespace.

    Args:
        file_name (str): The filename.

    Returns:
        str: The sequenceIdentifier value.
    """
    prefix = separate_filename(file_name)[0]
    return prefix.rstrip("_").strip()
[docs]
def separate_filename(file_name: str) -> Tuple[str, str]:
    """Split a filename into its leading part and its trailing run of digits.

    The leading part must end in a non-digit character, so the numeric suffix
    is always the complete trailing digit run.

    Args:
        file_name (str): The filename.

    Returns:
        Tuple[str, str]: The ``(prefix, digits)`` pair.

    Raises:
        IndexError: If the filename does not end in digits preceded by at
            least one non-digit character.
    """
    # The pattern is anchored at the start, so findall yields at most one
    # (prefix, digits) pair; indexing an empty result raises IndexError.
    matches = re.findall(r"^(.*\D)(\d+)$", file_name)
    prefix, digits = matches[0]
    return prefix, digits
[docs]
def add_subject_identifier(file_name: str) -> str:
    """Extract the subjectIdentifier value from a filename.

    The identifier is the text between the leading ``[digits]_`` and the
    trailing ``_[digits]``, with surrounding whitespace removed.

    Args:
        file_name (str): The filename.

    Returns:
        str: The subjectIdentifier value.

    Raises:
        IndexError: If the filename does not have the
            ``[digits]_..._[digits]`` shape.
    """
    matches = re.findall(r"^(\d+_)(.*?)(_\d+)$", file_name)
    # Each match is a (leading, middle, trailing) triple; only the middle is used.
    middle = matches[0][1]
    return middle.strip()