Skip to content

I/O — File Format Parsers

spectrakit.io.read_jcamp

read_jcamp(path: str | Path) -> Spectrum

Read a JCAMP-DX file and return a Spectrum.

Parses ##XYDATA=(X++(Y..Y)) format. Supports AFFN (ASCII free-format numeric) encoding.

Parameters:

Name Type Description Default
path str | Path

Path to the .dx / .jdx / .jcamp file.

required

Returns:

Type Description
Spectrum

Spectrum with intensities shape (W,) and wavenumbers shape (W,).

Raises:

Type Description
FileNotFoundError

If path does not exist.

ValueError

If the file cannot be parsed.

Source code in src/spectrakit/io/jcamp.py
def read_jcamp(path: str | Path) -> Spectrum:
    """Parse a JCAMP-DX spectrum file into a Spectrum.

    Handles the ##XYDATA=(X++(Y..Y)) table layout with AFFN (ASCII
    free-format numeric) encoded values.

    Args:
        path: Location of the .dx / .jdx / .jcamp file.

    Returns:
        Spectrum whose intensities and wavenumbers both have shape (W,).

    Raises:
        FileNotFoundError: If path does not exist.
        ValueError: If the file cannot be parsed.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"JCAMP file not found: {path}")

    validate_file_size(path.stat().st_size, path_name=str(path))

    metadata: dict[str, str] = {}
    abscissas: list[float] = []
    ordinates: list[float] = []
    reading_table = False

    with open(path, encoding="utf-8", errors="replace") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue

            ldr = _LDR_PATTERN.match(text)
            if ldr:
                label = ldr.group(1).strip().upper()
                if label == "XYDATA":
                    reading_table = True
                elif label == "END":
                    reading_table = False
                else:
                    # Any other labelled data record terminates the table
                    # and is stored as plain metadata.
                    metadata[label] = ldr.group(2).strip()
                    reading_table = False
                continue

            if not reading_table:
                continue

            tokens = _AFFN_NUMBER.findall(text)
            if len(tokens) < 2:
                continue
            # Table rows are "X Y Y Y ...": one abscissa, then ordinates.
            abscissas.append(float(tokens[0]))
            ordinates.extend(float(tok) for tok in tokens[1:])

    if not ordinates:
        raise FileFormatError(f"No XYDATA found in {path}")

    # Prefer the declared FIRSTX/LASTX headers; fall back to the first and
    # last abscissa actually seen in the table.
    first_x = float(metadata.get("FIRSTX", str(abscissas[0])))
    last_x = float(metadata.get("LASTX", str(abscissas[-1])))
    n_points = len(ordinates)

    # The x-axis is reconstructed as an evenly spaced grid over [first_x, last_x].
    wavenumbers = np.linspace(first_x, last_x, n_points)
    intensities = np.array(ordinates, dtype=np.float64)

    logger.debug("Read JCAMP: %d points from %s", n_points, path.name)

    return Spectrum(
        intensities=intensities,
        wavenumbers=wavenumbers,
        metadata=metadata,
        source_format="jcamp",
        label=path.stem,
    )

spectrakit.io.read_spc

read_spc(path: str | Path) -> Spectrum

Read a Galactic SPC file and return a Spectrum.

Requires the spc-spectra package (install via pip install spectrakit[io]).

Parameters:

Name Type Description Default
path str | Path

Path to the .spc file.

required

Returns:

Type Description
Spectrum

Spectrum with intensities shape (W,) or (N, W) for multi-trace files.

Raises:

Type Description
ImportError

If spc-spectra is not installed.

FileNotFoundError

If path does not exist.

Source code in src/spectrakit/io/spc.py
def read_spc(path: str | Path) -> Spectrum:
    """Load a Galactic SPC file as a Spectrum.

    Requires the ``spc-spectra`` package (install via
    ``pip install spectrakit[io]``).

    Args:
        path: Path to the .spc file.

    Returns:
        Spectrum with intensities shape (W,) or (N, W) for multi-trace files.

    Raises:
        ImportError: If spc-spectra is not installed.
        FileNotFoundError: If path does not exist.
    """
    try:
        import spc
    except ImportError as e:
        raise DependencyError(
            "spc-spectra is required for SPC files. Install with: pip install spectrakit[io]"
        ) from e

    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"SPC file not found: {path}")

    validate_file_size(path.stat().st_size, path_name=str(path))

    spc_file = spc.File(str(path))

    # The x-axis is shared by every sub-spectrum in the file.
    wavenumbers = np.array(spc_file.x, dtype=np.float64)
    if spc_file.fnsub == 1:
        # Single trace: flatten to a 1-D intensity vector.
        intensities = np.array(spc_file.sub[0].y, dtype=np.float64)
    else:
        # Multiple traces: stack into an (N, W) matrix.
        intensities = np.array([trace.y for trace in spc_file.sub], dtype=np.float64)

    metadata = {
        "fnsub": spc_file.fnsub,
        "fexper": getattr(spc_file, "fexper", ""),
    }

    logger.debug("Read SPC: %s sub-spectra from %s", spc_file.fnsub, path.name)

    return Spectrum(
        intensities=intensities,
        wavenumbers=wavenumbers,
        metadata=metadata,
        source_format="spc",
        label=path.stem,
    )

spectrakit.io.read_csv

read_csv(
    path: str | Path,
    delimiter: str = ",",
    x_column: int = 0,
    y_column: int = 1,
    skip_header: int = 0,
    orientation: Literal["columns", "rows"] = "columns",
) -> Spectrum

Read spectral data from a CSV or TSV file.

Parameters:

Name Type Description Default
path str | Path

Path to the CSV file.

required
delimiter str

Column separator. Use "\t" for TSV.

','
x_column int

Index of the wavenumber/wavelength column. Set to -1 to indicate no x-axis column (y data only).

0
y_column int

Index of the intensity column (for single-spectrum files). Ignored when orientation="rows".

1
skip_header int

Number of header lines to skip.

0
orientation Literal['columns', 'rows']

"columns" means each column is a variable (x, y1, y2...); "rows" means each row is a full spectrum.

'columns'

Returns:

Type Description
Spectrum

Spectrum with intensities and optional wavenumbers.

Source code in src/spectrakit/io/csv.py
def read_csv(
    path: str | Path,
    delimiter: str = ",",
    x_column: int = 0,
    y_column: int = 1,
    skip_header: int = 0,
    orientation: Literal["columns", "rows"] = "columns",
) -> Spectrum:
    """Load spectral data from a delimited text (CSV/TSV) file.

    Args:
        path: Path to the CSV file.
        delimiter: Column separator; pass "\\t" for tab-separated files.
        x_column: Index of the wavenumber/wavelength column. Set to -1
            to indicate no x-axis column (y data only).
        y_column: Index of the intensity column (for single-spectrum files).
            Ignored when orientation="rows".
        skip_header: Number of header lines to skip.
        orientation: "columns" means each column is a variable (x, y1, y2...);
            "rows" means each row is a full spectrum.

    Returns:
        Spectrum with intensities and optional wavenumbers.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"CSV file not found: {path}")

    validate_file_size(path.stat().st_size, path_name=str(path))

    table = np.genfromtxt(
        path,
        delimiter=delimiter,
        skip_header=skip_header,
        dtype=np.float64,
    )

    # A single row/column collapses to 1-D: treat it as intensities only.
    if table.ndim == 1:
        return Spectrum(
            intensities=table,
            source_format="csv",
            label=path.stem,
        )

    wavenumbers: np.ndarray | None = None
    has_x_axis = x_column >= 0

    if orientation == "columns":
        if has_x_axis:
            wavenumbers = table[:, x_column]
            y_indices = [col for col in range(table.shape[1]) if col != x_column]
            if len(y_indices) == 1:
                intensities = table[:, y_indices[0]]
            else:
                intensities = table[:, y_indices].T  # (N, W)
        elif y_column >= 0 and table.shape[1] > 1:
            intensities = table[:, y_column]
        else:
            intensities = table
    elif has_x_axis:
        # Row orientation: the x-axis occupies one row; remaining rows are spectra.
        wavenumbers = table[x_column, :]
        intensities = np.delete(table, x_column, axis=0)
    else:
        intensities = table

    logger.debug("Read CSV: shape %s from %s", intensities.shape, path.name)

    return Spectrum(
        intensities=intensities,
        wavenumbers=wavenumbers,
        source_format="csv",
        label=path.stem,
    )

spectrakit.io.read_opus

read_opus(path: str | Path) -> Spectrum

Read a Bruker OPUS binary file and return a Spectrum.

Parses the OPUS binary format natively without external dependencies. Extracts the absorbance/transmittance spectrum, wavenumber axis (computed from FXV/LXV parameters), and metadata.

Parameters:

Name Type Description Default
path str | Path

Path to the OPUS file (.0, .1, .2, etc.).

required

Returns:

Type Description
Spectrum

Spectrum with intensities shape (W,) and wavenumbers (W,).

Raises:

Type Description
FileNotFoundError

If path does not exist.

FileFormatError

If the file cannot be parsed as valid OPUS.

Source code in src/spectrakit/io/opus.py
def read_opus(path: str | Path) -> Spectrum:
    """Read a Bruker OPUS binary file and return a Spectrum.

    Parses the OPUS binary format natively without external dependencies.
    Extracts the absorbance/transmittance spectrum, wavenumber axis
    (computed from FXV/LXV parameters), and metadata.

    Args:
        path: Path to the OPUS file (.0, .1, .2, etc.).

    Returns:
        Spectrum with intensities shape ``(W,)`` and wavenumbers ``(W,)``.

    Raises:
        FileNotFoundError: If *path* does not exist.
        FileFormatError: If the file cannot be parsed as valid OPUS.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"OPUS file not found: {path}")

    try:
        raw = path.read_bytes()
    except OSError as exc:
        raise FileFormatError(f"Cannot read OPUS file: {exc}") from exc

    validate_file_size(len(raw), path_name=str(path))

    if len(raw) < _MIN_FILE_SIZE:
        raise FileFormatError(f"File too small to be OPUS format ({len(raw)} bytes).")

    # ── Parse block directory ───────────────────────────────────────
    try:
        entries = _parse_directory(raw)
    except FileFormatError:
        # Already a well-formed domain error; re-raise unchanged.
        raise
    except (struct.error, IndexError, TypeError, ValueError) as exc:  # pragma: no cover
        raise FileFormatError(f"Failed to parse OPUS directory: {exc}") from exc

    # ── Locate data and parameter blocks ────────────────────────────
    # Strategy: look for AB (absorbance) data first, then fall back to
    # single-channel sample, then single-channel reference.
    data_type_priority = [
        _DATA_TYPE_AB,
        _DATA_TYPE_SC_SAMPLE,
        _DATA_TYPE_SC_REF,
    ]
    param_type_priority = [
        _PARAM_DATA_STATUS,
        _PARAM_SC_STATUS,
        _PARAM_RF_STATUS,
    ]

    # Each tuple is (block_type, block_length, block_offset) from the directory.
    data_block: tuple[int, int, int] | None = None
    param_block: tuple[int, int, int] | None = None

    # Paired search: each data type is tried together with its matching
    # status-parameter type (strict=True keeps the two lists in lockstep).
    for data_type, param_type in zip(data_type_priority, param_type_priority, strict=True):
        data_candidates = _find_blocks_by_type(entries, data_type)
        param_candidates = _find_blocks_by_type(entries, param_type)
        if data_candidates and param_candidates:
            data_block = data_candidates[0]
            param_block = param_candidates[0]
            logger.debug(
                "Using data block type=0x%02X, param block type=0x%02X",
                data_type,
                param_type,
            )
            break

    # No paired match above: fall back to the first data block of any
    # priority type, even without a matching status block.
    if data_block is None:
        for data_type in data_type_priority:
            data_candidates = _find_blocks_by_type(entries, data_type)
            if data_candidates:
                data_block = data_candidates[0]
                break

    if data_block is None:
        raise FileFormatError(
            "No spectral data block found in OPUS file. "
            "Searched for AB, single-channel sample, and reference blocks."
        )

    # Try all parameter block types if we don't have one yet
    if param_block is None:
        for param_type in param_type_priority:
            param_candidates = _find_blocks_by_type(entries, param_type)
            if param_candidates:
                param_block = param_candidates[0]
                break

    if param_block is None:
        raise FileFormatError(
            "No data parameter block found in OPUS file. Cannot determine NPT, FXV, LXV."
        )

    # ── Extract spectral parameters (NPT, FXV, LXV) ────────────────
    _, param_len, param_off = param_block
    try:
        params = _parse_parameter_block(raw, param_off, param_len)
    except (struct.error, IndexError, UnicodeDecodeError, ValueError) as exc:  # pragma: no cover
        raise FileFormatError(f"Failed to parse OPUS parameter block: {exc}") from exc

    n_points = params.get("NPT")
    first_x = params.get("FXV")
    last_x = params.get("LXV")

    if n_points is None:
        raise FileFormatError("NPT (number of points) not found in OPUS parameter block.")
    # Normalize to int — the parameter block may yield a numeric type.
    n_points = int(n_points)

    if n_points <= 0:
        raise FileFormatError(f"Invalid NPT value in OPUS file: {n_points}")

    # ── Read spectral data ──────────────────────────────────────────
    _, _data_len, data_off = data_block
    try:
        intensities = _read_float32_block(raw, data_off, n_points)
    except FileFormatError:
        raise
    except (struct.error, ValueError, TypeError, IndexError) as exc:  # pragma: no cover
        raise FileFormatError(f"Failed to read OPUS data block: {exc}") from exc

    # ── Build wavenumber axis ───────────────────────────────────────
    # FXV/LXV are optional: without them the Spectrum carries no x-axis.
    wavenumbers: np.ndarray | None = None
    if first_x is not None and last_x is not None:
        wavenumbers = np.linspace(float(first_x), float(last_x), n_points, dtype=np.float64)

    # ── Collect metadata from instrument/sample parameter blocks ────
    # Metadata parsing is best-effort: a malformed block is logged and skipped.
    metadata: dict[str, Any] = {}
    metadata_block_types = [_PARAM_INSTRUMENT, _PARAM_SAMPLE]
    for mtype in metadata_block_types:
        for _, mlen, moff in _find_blocks_by_type(entries, mtype):
            try:
                block_params = _parse_parameter_block(raw, moff, mlen)
                metadata.update(block_params)
            except (struct.error, ValueError, IndexError) as exc:  # pragma: no cover
                logger.debug("Could not parse metadata block at offset %d: %s", moff, exc)

    # Also include the data parameters in metadata
    metadata.update(params)

    logger.debug("Read OPUS: %d points from %s", n_points, path.name)

    return Spectrum(
        intensities=intensities,
        wavenumbers=wavenumbers,
        metadata=metadata,
        source_format="opus",
        label=path.stem,
    )

spectrakit.io.read_hdf5

read_hdf5(
    path: str | Path,
    intensities_key: str = "intensities",
    wavenumbers_key: str = "wavenumbers",
) -> Spectrum

Read spectral data from an HDF5 file.

Parameters:

Name Type Description Default
path str | Path

Path to the .h5 / .hdf5 file.

required
intensities_key str

Dataset key for intensity values.

'intensities'
wavenumbers_key str

Dataset key for wavenumber values.

'wavenumbers'

Returns:

Type Description
Spectrum

Spectrum loaded from the HDF5 datasets.

Raises:

Type Description
ImportError

If h5py is not installed.

FileNotFoundError

If path does not exist.

Source code in src/spectrakit/io/hdf5.py
def read_hdf5(
    path: str | Path,
    intensities_key: str = "intensities",
    wavenumbers_key: str = "wavenumbers",
) -> Spectrum:
    """Load spectral data stored in an HDF5 file.

    Args:
        path: Path to the .h5 / .hdf5 file.
        intensities_key: Dataset key for intensity values.
        wavenumbers_key: Dataset key for wavenumber values.

    Returns:
        Spectrum loaded from the HDF5 datasets.

    Raises:
        ImportError: If h5py is not installed.
        FileNotFoundError: If path does not exist.
    """
    try:
        import h5py
    except ImportError as e:
        raise DependencyError(
            "h5py is required for HDF5 files. Install with: pip install spectrakit[io]"
        ) from e

    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"HDF5 file not found: {path}")

    validate_file_size(path.stat().st_size, path_name=str(path))

    with h5py.File(path, "r") as store:
        intensities = np.array(store[intensities_key], dtype=np.float64)
        # The wavenumber dataset is optional.
        wavenumbers = (
            np.array(store[wavenumbers_key], dtype=np.float64)
            if wavenumbers_key in store
            else None
        )
        # File-level HDF5 attributes become Spectrum metadata.
        metadata: dict[str, Any] = dict(store.attrs.items())

    logger.debug("Read HDF5: shape %s from %s", intensities.shape, path.name)

    return Spectrum(
        intensities=intensities,
        wavenumbers=wavenumbers,
        metadata=metadata,
        source_format="hdf5",
        label=path.stem,
    )

spectrakit.io.write_hdf5

write_hdf5(
    spectrum: Spectrum,
    path: str | Path,
    intensities_key: str = "intensities",
    wavenumbers_key: str = "wavenumbers",
) -> None

Write a Spectrum to an HDF5 file.

Parameters:

Name Type Description Default
spectrum Spectrum

Spectrum to save.

required
path str | Path

Output file path.

required
intensities_key str

Dataset key for intensity values.

'intensities'
wavenumbers_key str

Dataset key for wavenumber values.

'wavenumbers'
Source code in src/spectrakit/io/hdf5.py
def write_hdf5(
    spectrum: Spectrum,
    path: str | Path,
    intensities_key: str = "intensities",
    wavenumbers_key: str = "wavenumbers",
) -> None:
    """Serialize a Spectrum to an HDF5 file.

    Args:
        spectrum: Spectrum to save.
        path: Output file path.
        intensities_key: Dataset key for intensity values.
        wavenumbers_key: Dataset key for wavenumber values.
    """
    try:
        import h5py
    except ImportError as e:
        raise DependencyError(
            "h5py is required for HDF5 files. Install with: pip install spectrakit[io]"
        ) from e

    path = Path(path)
    with h5py.File(path, "w") as out:
        out.create_dataset(intensities_key, data=spectrum.intensities)
        if spectrum.wavenumbers is not None:
            out.create_dataset(wavenumbers_key, data=spectrum.wavenumbers)
        for attr_name, attr_value in spectrum.metadata.items():
            # h5py raises TypeError for values it cannot map to an HDF5
            # type; fall back to the string form rather than dropping them.
            try:
                out.attrs[attr_name] = attr_value
            except TypeError:
                out.attrs[attr_name] = str(attr_value)

    logger.debug("Wrote HDF5: %s", path)