
Example Data

Create some fake data for this example:

import numpy as np

# Fake up some data
np.random.seed(20211231)
x = np.linspace(10, 50, 50) + np.random.normal(size=50) * 5
y = np.linspace(10, 150, 50) + np.random.normal(size=50) * 5

Before plotting it, let’s change some display settings:

import matplotlib.pyplot as plt

# Make figures A6 in size
A = 6
plt.rc('figure', figsize=[46.82 * .5**(.5 * A), 33.11 * .5**(.5 * A)])
# Image quality
plt.rc('figure', dpi=141)
# Enable LaTeX text rendering
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
plt.rc('text.latex', preamble=r'\usepackage{textgreek}')
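
Note that usetex=True requires a working LaTeX installation with the textgreek package available; without one, Matplotlib will raise an error when the figure is rendered.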

Now we can plot:

# Create plot
ax = plt.axes()
# Scatter plot
ax.scatter(x, y, c='gray', marker='o', edgecolors='k', s=18)
# Set title
ax.set_title('Example Data')
# Adjust the axis limits
left, right = ax.get_xlim()
bottom, top = ax.get_ylim()
ax.set_xlim(0, right)
ax.set_ylim(0, top)

plt.show()

Modelling: Simple Linear Regression

Fit a polynomial of degree 1 (ie a straight line) to the data using polyfit() from NumPy:

# Degree of the fitting polynomial
deg = 1
# Parameters from the fit of the polynomial
p = np.polyfit(x, y, deg)
m = p[0]  # Gradient
c = p[1]  # y-intercept

print(f'The fitted straight line has equation y = {m:.1f}x {c:=+6.1f}')
## The fitted straight line has equation y = 3.2x - 16.7
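
NumPy's documentation recommends the newer numpy.polynomial API over np.polyfit(). A minimal sketch of the equivalent fit (the names line, m_alt and c_alt are just for illustration; note that this class returns the coefficients in increasing order of degree, so the intercept comes first):

from numpy.polynomial import Polynomial

# Fit a straight line; convert() undoes the internal domain scaling so the
# coefficients are returned in the natural domain
line = Polynomial.fit(x, y, deg).convert()
c_alt, m_alt = line.coef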

Get the critical t-value that can be used in hypothesis testing for statistical significance:

from scipy import stats

# Number of observations
n = y.size
# Number of parameters: the degree of the fitted polynomial plus 1 (a
# straight line has two: the gradient and the y-intercept). A new variable
# name is used so as not to overwrite the gradient, m
n_params = p.size
# Degrees of freedom (number of observations - number of parameters)
dof = n - n_params
# Significance level
alpha = 0.05
# We're using a two-sided test
tails = 2
# The percent-point function (aka the quantile function) of the t-distribution
# gives you the critical t-value that must be met in order to get significance
t_critical = stats.t.ppf(1 - (alpha / tails), dof)
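
With 50 observations and 2 parameters there are 48 degrees of freedom, so this critical value should come out at roughly 2.01:

print(f'Critical t-value: {t_critical:.2f}')
## Critical t-value: 2.01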

Use the fitted straight line to model the data: evaluate the fitted polynomial at the x-values in the raw data to get the y-values the model predicts, as opposed to the y-values that were actually observed. This can be done with either polyval() or poly1d() (they give the same result):

# Model the data using the parameters of the fitted straight line
y_model = np.polyval(p, x)

# Create the linear (1 degree polynomial) model
model = np.poly1d(p)
# Fit the model
y_model = model(x)

Get the coefficient of determination, R², which measures how well the model (the straight line) fits the data:

# Mean
y_bar = np.mean(y)
# Coefficient of determination, R²
R2 = np.sum((y_model - y_bar)**2) / np.sum((y - y_bar)**2)

print(f'R² = {R2:.2f}')
## R² = 0.86
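
As a cross-check, scipy.stats.linregress() performs the same fit and returns the correlation coefficient directly; for a least-squares straight line its square is exactly R²:

# Cross-check against SciPy's built-in simple linear regression
result = stats.linregress(x, y)

print(f'R² = {result.rvalue**2:.2f}')
## R² = 0.86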

Estimates of the error:

# Calculate the residuals (the error in the data, according to the model)
resid = y - y_model
# Chi-squared (the sum of squared residuals, taking each point's model
# value as an estimate of its uncertainty)
chi2 = np.sum((resid / y_model)**2)
# Reduced chi-squared (measures the goodness-of-fit)
chi2_red = chi2 / dof
# Standard deviation of the error (aka the residual standard error)
std_err = np.sqrt(np.sum(resid**2) / dof)
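
The critical t-value calculated earlier can now be put to use: dividing the gradient by its standard error gives a t-statistic, and the gradient is statistically significant if this exceeds the critical value. A minimal sketch using the quantities defined above (se_gradient and t_stat are illustrative names):

# Standard error of the gradient
se_gradient = std_err / np.sqrt(np.sum((x - np.mean(x))**2))
# t-statistic for the null hypothesis that the true gradient is zero
t_stat = p[0] / se_gradient
# Compare against the critical t-value
print(f'Gradient is significant: {np.abs(t_stat) > t_critical}')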

Plotting
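
The half-widths of the intervals used below are the standard ones for simple linear regression, where $t^*$ is the critical t-value and $s$ is the standard deviation of the error calculated above:

$$\mathrm{ci} = t^* s \sqrt{\frac{1}{n} + \frac{(x - \bar{x})^2}{\sum_i (x_i - \bar{x})^2}}
\qquad
\mathrm{pi} = t^* s \sqrt{1 + \frac{1}{n} + \frac{(x - \bar{x})^2}{\sum_i (x_i - \bar{x})^2}}$$

The extra 1 under the prediction interval's square root accounts for the scatter of individual observations about the fitted line, which is why the prediction limits plot wider than the confidence band.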

# Create plot
plt.scatter(x, y, c='gray', marker='o', edgecolors='k', s=18)
xlim = plt.xlim()
ylim = plt.ylim()
# Line of best fit
plt.plot(np.array(xlim), p[1] + p[0] * np.array(xlim), label=f'Line of Best Fit, R² = {R2:.2f}')
# Fit
x_fitted = np.linspace(xlim[0], xlim[1], 100)
y_fitted = np.polyval(p, x_fitted)
# Confidence interval
ci = t_critical * std_err * np.sqrt(1 / n + (x_fitted - np.mean(x))**2 / np.sum((x - np.mean(x))**2))
plt.fill_between(
    x_fitted, y_fitted + ci, y_fitted - ci, facecolor='#b9cfe7', zorder=0,
    label=r'95\% Confidence Interval'
)
# Prediction Interval
pi = t_critical * std_err * np.sqrt(1 + 1 / n + (x_fitted - np.mean(x))**2 / np.sum((x - np.mean(x))**2))
plt.plot(x_fitted, y_fitted - pi, '--', color='0.5', label=r'95\% Prediction Limits')
plt.plot(x_fitted, y_fitted + pi, '--', color='0.5')
# Title and labels
plt.title('Simple Linear Regression')
plt.xlabel('Independent Variable')
plt.ylabel('Dependent Variable')
# Finished
plt.legend(fontsize=8)
plt.xlim(xlim)
plt.ylim(0, ylim[1])
plt.show()

Multiple Groups

Data:

# Fake up some more data
x1 = np.linspace(10, 50, 50) + np.random.normal(size=50) * 5
y1 = np.linspace(30, 130, 50) + np.random.normal(size=50) * 5
x2 = np.linspace(10, 50, 50) + np.random.normal(size=50) * 5
y2 = np.linspace(10, 150, 50) + np.random.normal(size=50) * 5

Simple linear regression:

# Parameters from the fit of the polynomial
p1 = np.polyfit(x1, y1, 1)
p2 = np.polyfit(x2, y2, 1)
# Coefficient of determination, R²
R_sq1 = 1 - (sum((y1 - (p1[0] * x1 + p1[1]))**2) / ((len(y1) - 1) * np.var(y1, ddof=1)))
R_sq2 = 1 - (sum((y2 - (p2[0] * x2 + p2[1]))**2) / ((len(y2) - 1) * np.var(y2, ddof=1)))
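
Note that this is the equivalent 1 - SS_res/SS_tot formulation of R²; for a least-squares straight line it gives the same result as the explained-variance formula used in the previous section.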

Get the critical t-value:

# Number of observations
n = y1.size
# Number of parameters: the degree of the fitted polynomial plus 1
n_params = p1.size
# Degrees of freedom (number of observations - number of parameters)
dof = n - n_params
# The percent-point function (aka the quantile function) of the t-distribution
# gives you the critical t-value that must be met in order to get significance
t_critical = stats.t.ppf(1 - (0.05 / 2), dof)
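
This can be used exactly as before to construct a confidence interval for each group. A minimal sketch for the first dataset, reusing the formulas from the previous section (resid1, std_err1, x_ci and ci1 are illustrative names, and these intervals are not drawn in the plot below):

# Residuals and standard deviation of the error for dataset 1
resid1 = y1 - np.polyval(p1, x1)
std_err1 = np.sqrt(np.sum(resid1**2) / dof)
# Confidence interval half-width for dataset 1
x_ci = np.linspace(np.min(x1), np.max(x1), 100)
ci1 = t_critical * std_err1 * np.sqrt(
    1 / n + (x_ci - np.mean(x1))**2 / np.sum((x1 - np.mean(x1))**2)
)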

Plot:

# Create plot
plt.scatter(x1, y1, c='tab:orange', marker='x')
plt.scatter(x2, y2, c='tab:blue', marker='x')
xlim = plt.xlim()
ylim = plt.ylim()
# Lines of best fit (evaluate each fitted model across the x-axis limits)
x_fit = np.linspace(xlim[0], xlim[1], 100)
y_fit1 = np.polyval(p1, x_fit)
y_fit2 = np.polyval(p2, x_fit)
plt.plot(x_fit, y_fit1, c='tab:orange', label=f'Dataset 1: R² = {R_sq1:0.2f}')
plt.plot(x_fit, y_fit2, c='tab:blue', label=f'Dataset 2: R² = {R_sq2:0.2f}')
# Text
mu1 = np.mean(y1)
plt.text(10, 70, 'Dataset 1', ha='center', c='tab:orange')
plt.text(10, 65, rf'\textmu\ = {mu1:.1f}', ha='center', c='tab:orange')
mu2 = np.mean(y2)
plt.text(25, 10, 'Dataset 2', ha='center', c='tab:blue')
plt.text(25, 5, rf'\textmu\ = {mu2:.1f}', ha='center', c='tab:blue')
# Title and labels
plt.title('Simple Linear Regressions')
plt.xlabel('Independent Variable')
plt.ylabel('Dependent Variable')
# Finished
plt.legend(fontsize=8)
plt.xlim(xlim)
plt.ylim(0, ylim[1])
plt.show()
