Source code for gaiasupdate.metrics

#   Copyright (c) European Space Agency, 2026.
#
#   This file is subject to the terms and conditions defined in file "LICENSE.txt", which
#   is part of this source code package. No part of the package, including
#   this file, may be copied, modified, propagated, or distributed except according to
#   the terms contained in the file "LICENSE.txt".

"""Module providing statistical metrics and an auxiliary class."""

__author__ = "Johannes Sahlmann"

import logging

import numpy as np
from scipy.special import betainc as betai



[docs]
class SolutionStatistic:
    """Convenience class for handling solutions of linear equations systems."""

    _required_parameters = ['n_parameters', 'n_measurements', 'residuals', 'total_variance',
                            'measurement_variance']

    def __init__(self, parameters):
        """Initialise.

        Parameters
        ----------
        parameters : dict

        """
        for key, item in parameters.items():
            setattr(self, key, item)
        for key in self._required_parameters:
            assert hasattr(self, key)

        if hasattr(self, 'model') is False:
            self.model = f'na_{self.n_parameters}p'
        if hasattr(self, 'solver') is False:
            self.solver = 'na'
        if hasattr(self, 'excess_noise') is False:
            self.excess_noise = None
        if hasattr(self, 'n_outliers') is False:
            self.n_outliers = 0

        assert self.n_measurements == len(self.residuals)

        # TODO
        # define these as properties

        # number of degrees of freedom
        self.n_degrees_of_freedom = self.n_measurements - self.n_parameters

        # chi squared
        # from MDB/CU3/AGIS/Source datamodel: These $\chi^2$ values were computed for all accepted
        # observations of the source, without taking into account the source $\tt excessNoise$ (if any).
        # measurement_variance = formal uncertainty + attitude excess noise
        self.chi2 = chi_squared(self.residuals, self.measurement_variance)
        self.chi2_total = chi_squared(self.residuals, self.total_variance)

        # log of the likelihood
        self.ln_likelihood = log_likelihood(self.residuals, self.total_variance)

        # Bayesian Information Criterion (BIC), see e.g. https://en.wikipedia.org/wiki/Bayesian_information_criterion
        self.bic = bayesian_information_criterion(self.n_parameters, self.n_measurements,
                                                  self.ln_likelihood)
        self.aic = akaike_information_criterion(self.n_parameters, self.ln_likelihood)

        # Gaia F2
        self.f2 = gaia_f2(self.chi2, self.n_degrees_of_freedom)
        # compute also when using total variance in chi2
        self.f2_total = gaia_f2(self.chi2_total, self.n_degrees_of_freedom)

        # RMS dispersion of the residuals
        self.residuals_rms = np.std(self.residuals)
        self.residuals_mean = np.mean(self.residuals)

    def __add__(self, other):
        """Compute the fit statistics for a pair of independent results, e.g. 2 x 5p."""
        combined_parameters = {}
        for key in ['n_parameters', 'n_measurements', 'n_outliers']:
            combined_parameters[key] = getattr(self, key) + getattr(other, key)
        for key in ['residuals', 'total_variance', 'measurement_variance']:
            combined_parameters[key] = np.concatenate((getattr(self, key), getattr(other, key)))
        for key in ['excess_noise']:
            combined_parameters[key] = np.array([getattr(self, key), getattr(other, key)])
        return SolutionStatistic(combined_parameters)

    def __gt__(self, other):
        """Decide whether self or other is a statistically better solution.

        For two solutions s1 and s2, s1 > s2 if s1 is the "better solution".

        Several metrics are computed (deltaBIC, F-test, ...), but only one score metric will be
        defined as the decisive one.

        """
        # F-TEST (assumes everything is perfectly Gaussian.)
        # number of data points has to be equal
        if self.n_measurements != other.n_measurements:
            return None

        # probability that the simpler model is better
        f_test_probability = f_test_of_additional_parameter(self.n_measurements, self.n_parameters,
                                                            self.chi2, other.n_parameters,
                                                            other.chi2)
        print(f'F-test prob: {f_test_probability:.1e}')
        f_test_probability_threshold = 1e-4

        if (f_test_probability is None) or (f_test_probability < f_test_probability_threshold):
            return False
        else:
            return True

    def __str__(self):
        """Return string describing the instance."""
        string_0 = f'model={self.model} solver={self.solver}\nn_meas={self.n_measurements} \
        n_param={self.n_parameters} n_free={self.n_degrees_of_freedom} n_outliers={self.n_outliers}'
        string_1 = f'omc_rms={self.residuals_rms:.3f} f2={self.f2:.3f} bic={self.bic:.3f} chi2={self.chi2:.3f}'
        if self.excess_noise is not None:
            if np.ndim(self.excess_noise) == 0:
                string_1 += f'\nexcess_noise={self.excess_noise:.3e}'
            else:
                string_1 += f"\nexcess_noise={['{:.3e}'.format(x) for x in self.excess_noise if x is not None]}"
        return f'SolutionStatistic: {string_0}\n{string_1}'




[docs]
def chi_squared(residuals, variance):
    """Return chi2 fit quality metric.

    Parameters
    ----------
    residuals: narray
        Array of residuals.
    variance: narray
        Array of variances.

    Returns
    -------
    float
        chi2 fit quality metric

    """
    return np.sum(residuals ** 2 / variance)




[docs]
def bayesian_information_criterion(n_parameters, n_measurements, ln_likelihood):
    """Return BIC.

    Parameters
    ----------
    n_parameters: int
        Number of parameters.
    n_measurements: int
        Number of measurements.
    ln_likelihood: float
        Logarithm of the likelihood

    Returns
    -------
    float
        Bayesian information criterion

    """
    bic = n_parameters * np.log(n_measurements) - 2 * ln_likelihood
    return bic




[docs]
def akaike_information_criterion(n_parameters, ln_likelihood):
    """Return AIC.

    Parameters
    ----------
    n_parameters: int
        Number of parameters.
    n_measurements: int
        Number of measurements.

    Returns
    -------
    float
        Akaike information criterion

    """
    aic = 2 * n_parameters - 2 * ln_likelihood
    return aic




[docs]
def log_likelihood_0(variance):
    """Return lnL0 term.

    Parameters
    ----------
    variance: narray
        Array of variances.

    Returns
    -------
    float
        lnL0

    """
    lnL0 = np.sum(np.log(np.sqrt(variance * 2 * np.pi)))
    return lnL0




[docs]
def log_likelihood(residuals, variance):
    """Return the log(likelihood).

    Parameters
    ----------
    residuals: narray
        Array of residuals.
    variance: narray
        Array of variances.

    Returns
    -------
    float
        ln_likelihood

    """
    chi2 = chi_squared(residuals, variance)
    lnL0 = log_likelihood_0(variance)
    ln_likelihood = -0.5 * (chi2) - lnL0
    return ln_likelihood




[docs]
def f_test_of_additional_parameter(n_measurements: int, n_parameters_1: int, chi2_1: float,
                                   n_parameters_2: int, chi2_2: float) -> float:
    """Return F-Test probability that the simpler model is correct.

    Note
    ----
    e.g. n_parameters_1 = 5.  is the number of PPM parameters

    e.g. n_parameters_2 = n_parameters_1 + 7. is the number of PPM + orbital parameters

    Parameters
    ----------
    n_measurements: int
        Number of data points
    n_parameters_1: int
        Number of parameters of the simpler model
    chi2_1: float
        chi^2 corresponding to the simpler model
    n_parameters_2: int
        Number of parameters of the model with more parameters
        n_parameters_2 > p1
    chi2_2: float
        chi^2 corresponding to the model with more parameters

    Returns
    -------
    float
        Probability.

    """
    nu1 = n_parameters_2 - n_parameters_1
    nu2 = n_measurements - n_parameters_2  # degrees of freedom

    # the self.model has to be simpler than the other.model
    if np.any(n_parameters_1 >= n_parameters_2):
        raise ValueError('First model has to be simpler than second model.')

    if np.any(chi2_1 < chi2_2):
        logging.warning('Solution better with less parameters.')
        return 0.

    # F test
    F0 = nu2 / nu1 * (chi2_1 - chi2_2) / chi2_2

    # probability
    prob = betai(0.5 * nu2, 0.5 * nu1, nu2 / (nu2 + F0 * nu1))

    return prob




[docs]
def gaia_f2(chi2, nu):
    """Return F2, a goodness-of-fit statistic.

    Note
    ----
    In the Gaia archive this is called astrometric_gof_al, see the `Archive documentation <https://gea.esac.esa.int/archive/documentation/GDR3/Gaia_archive/chap_datamodel/sec_dm_main_source_catalogue/ssec_dm_gaia_source.html>`__.

    Parameters
    ----------
    chi2 : float
        goodness-of-fit statistic chi^2.
    nu : int
        number of degrees of freedom.

    Returns
    -------
    float
        Gaia f2, which is a goodness of fit statistic.

    """
    return np.sqrt(9 * nu / 2) * ((chi2 / nu) ** (1 / 3) + 2 / (9 * nu) - 1)