Source code for lr_cd.lr_data_generation

# lr_data_generation.py
# author: Sam Fo
# date: 2024-01-10

import numpy as np



[docs]
def generate_data_lr(n, n_features, theta, noise=0.2, random_seed=123):
    """Generate synthetic linear-regression data from known coefficients.

    Features are drawn uniformly from [0, 1) and the response is computed as
    ``intercept + X @ coefficients + Gaussian noise``.

    Parameters
    ----------
    n : integer
        The number of data points.
    n_features : integer
        The number of features to generate, excluding the intercept.
    theta : array_like
        The true scalar intercept and coefficient weights vector, of length
        ``n_features + 1``. The first element should always be the intercept.
        Any sequence accepted by ``np.asarray`` may be passed.
    noise : float
        The standard deviation of a normal distribution added to the
        generated target y array as noise.
    random_seed : integer
        Random seed to ensure reproducibility. The global NumPy random
        state is reseeded with this value once the inputs are validated.

    Returns
    -------
    X : ndarray
        Feature data matrix of shape (n, n_features).
    y : ndarray
        Response data matrix of shape (n, 1).

    Raises
    ------
    ValueError
        If ``n`` or ``n_features`` is not an integer, if the length of
        ``theta`` is not ``n_features + 1``, or if ``theta`` has fewer than
        two elements.

    Examples
    --------
    >>> import numpy as np
    >>> from lr_cd.lr_data_generation import generate_data_lr
    >>> theta = np.array([4, 3])
    >>> X, y = generate_data_lr(n=10, n_features=1, theta=theta)
    >>> X.shape, y.shape
    ((10, 1), (10, 1))
    """
    # Validate before seeding so that a rejected call does not clobber the
    # caller's global random state.
    if not isinstance(n, (int, np.integer)):
        raise ValueError('Sample size n must be an integer')

    if not isinstance(n_features, (int, np.integer)):
        raise ValueError('Number of features must be an integer')

    # Accept lists/tuples as well as ndarrays; ndarrays pass through as-is.
    theta = np.asarray(theta)

    if len(theta) != n_features + 1:
        raise ValueError('Number of features does not match with theta')

    if len(theta) < 2:
        raise ValueError('Insufficient number of elements in theta')

    np.random.seed(random_seed)

    # Draw features as (n_features, n) so each row lines up with one
    # coefficient; this consumes the random stream in the same order as a
    # flat draw of n * n_features values reshaped to (n_features, n).
    features = np.random.random(size=(n_features, n))
    true_intercept = theta[0]
    true_coeff = theta[1:].reshape(n_features, -1)
    noise_terms = np.random.normal(loc=0.0, scale=noise, size=n)

    # Broadcasting the column of coefficients across rows and summing over
    # the feature axis gives the noiseless linear predictor per sample.
    y = np.sum(features * true_coeff, axis=0) + true_intercept + noise_terms
    return features.T, y.reshape(-1, 1)