Agent skill

statistics-math

Statistics, probability, linear algebra, and mathematical foundations for data science

View SKILL.md on GitHub Repository

Stars 1,415

Forks 109

Install this agent skill to your Project

npx add-skill https://github.com/foryourhealth111-pixel/Vibe-Skills/tree/main/bundled/skills/statistics-math

SKILL.md

Statistics & Mathematics

Mathematical foundations for data science, machine learning, and statistical analysis.

Quick Start

python

import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression

# Descriptive Statistics
data = np.array([23, 45, 67, 32, 45, 67, 89, 12, 34, 56])
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
print(f"Std Dev: {np.std(data, ddof=1):.2f}")
print(f"IQR: {np.percentile(data, 75) - np.percentile(data, 25):.2f}")

# Hypothesis Testing
sample_a = [23, 45, 67, 32, 45]
sample_b = [56, 78, 45, 67, 89]
t_stat, p_value = stats.ttest_ind(sample_a, sample_b)
print(f"T-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

if p_value < 0.05:
    print("Reject null hypothesis: significant difference")
else:
    print("Fail to reject null hypothesis")

Core Concepts

1. Probability Distributions

python

import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Normal Distribution
mu, sigma = 100, 15
normal_dist = stats.norm(loc=mu, scale=sigma)
x = np.linspace(50, 150, 100)

# PDF, CDF calculations
print(f"P(X < 85): {normal_dist.cdf(85):.4f}")
print(f"P(X > 115): {1 - normal_dist.cdf(115):.4f}")
print(f"95th percentile: {normal_dist.ppf(0.95):.2f}")

# Binomial Distribution (discrete)
n, p = 100, 0.3
binom_dist = stats.binom(n=n, p=p)
print(f"P(X = 30): {binom_dist.pmf(30):.4f}")
print(f"P(X <= 30): {binom_dist.cdf(30):.4f}")

# Poisson Distribution (events per time)
lambda_param = 5
poisson_dist = stats.poisson(mu=lambda_param)
print(f"P(X = 3): {poisson_dist.pmf(3):.4f}")

# Central Limit Theorem demonstration
population = np.random.exponential(scale=10, size=100000)
sample_means = [np.mean(np.random.choice(population, 30)) for _ in range(1000)]
print(f"Sample means are approximately normal: mean={np.mean(sample_means):.2f}")

2. Hypothesis Testing Framework

python

from scipy import stats
import numpy as np

class HypothesisTest:
    """Framework for statistical hypothesis testing."""

    @staticmethod
    def two_sample_ttest(group_a, group_b, alpha=0.05):
        """Independent samples t-test."""
        t_stat, p_value = stats.ttest_ind(group_a, group_b)
        effect_size = (np.mean(group_a) - np.mean(group_b)) / np.sqrt(
            (np.var(group_a) + np.var(group_b)) / 2
        )
        return {
            "t_statistic": t_stat,
            "p_value": p_value,
            "significant": p_value < alpha,
            "effect_size_cohens_d": effect_size
        }

    @staticmethod
    def chi_square_test(observed, expected=None, alpha=0.05):
        """Chi-square test for categorical data."""
        if expected is None:
            chi2, p_value, dof, expected = stats.chi2_contingency(observed)
        else:
            chi2, p_value = stats.chisquare(observed, expected)
            dof = len(observed) - 1
        return {
            "chi2_statistic": chi2,
            "p_value": p_value,
            "degrees_of_freedom": dof,
            "significant": p_value < alpha
        }

    @staticmethod
    def ab_test_proportion(conversions_a, total_a, conversions_b, total_b, alpha=0.05):
        """Two-proportion z-test for A/B testing."""
        p_a = conversions_a / total_a
        p_b = conversions_b / total_b
        p_pooled = (conversions_a + conversions_b) / (total_a + total_b)

        se = np.sqrt(p_pooled * (1 - p_pooled) * (1/total_a + 1/total_b))
        z_stat = (p_a - p_b) / se
        p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))

        return {
            "conversion_a": p_a,
            "conversion_b": p_b,
            "lift": (p_b - p_a) / p_a * 100,
            "z_statistic": z_stat,
            "p_value": p_value,
            "significant": p_value < alpha
        }

# Usage
result = HypothesisTest.ab_test_proportion(
    conversions_a=120, total_a=1000,
    conversions_b=150, total_b=1000
)
print(f"Lift: {result['lift']:.1f}%, p-value: {result['p_value']:.4f}")

3. Linear Algebra Essentials

python

import numpy as np

# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Basic operations
print("Matrix multiplication:", A @ B)
print("Element-wise:", A * B)
print("Transpose:", A.T)
print("Inverse:", np.linalg.inv(A))
print("Determinant:", np.linalg.det(A))

# Eigenvalues and eigenvectors (PCA foundation)
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"Eigenvalues: {eigenvalues}")

# Singular Value Decomposition (dimensionality reduction)
U, S, Vt = np.linalg.svd(A)
print(f"Singular values: {S}")

# Solving linear systems: Ax = b
b = np.array([5, 11])
x = np.linalg.solve(A, b)
print(f"Solution: {x}")

# Cosine similarity (NLP, recommendations)
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

vec1 = np.array([1, 2, 3])
vec2 = np.array([4, 5, 6])
print(f"Cosine similarity: {cosine_similarity(vec1, vec2):.4f}")

4. Regression Analysis

python

import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm

# Multiple Linear Regression with statsmodels
X = np.random.randn(100, 3)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + np.random.randn(100)*0.5

X_with_const = sm.add_constant(X)
model = sm.OLS(y, X_with_const).fit()

print(model.summary())
print(f"R-squared: {model.rsquared:.4f}")
print(f"Coefficients: {model.params}")
print(f"P-values: {model.pvalues}")

# Regularization comparison
X_train, y_train = X[:80], y[:80]
X_test, y_test = X[80:], y[80:]

models = {
    "OLS": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name}: R²={r2_score(y_test, y_pred):.4f}, RMSE={np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

Tools & Technologies

Tool	Purpose	Version (2025)
NumPy	Numerical computing	1.26+
SciPy	Scientific computing	1.12+
pandas	Data manipulation	2.2+
statsmodels	Statistical models	0.14+
scikit-learn	ML algorithms	1.4+

Troubleshooting Guide

Issue	Symptoms	Root Cause	Fix
Low p-value, small effect	Significant but meaningless	Large sample size	Check effect size
High variance	Unstable estimates	Small sample, outliers	More data, robust methods
Multicollinearity	Inflated coefficients	Correlated features	VIF check, remove features
Heteroscedasticity	Invalid inference	Non-constant variance	Weighted least squares

Best Practices

python

# ✅ DO: Check assumptions before testing
from scipy.stats import shapiro
stat, p = shapiro(data)
if p > 0.05:
    print("Data is approximately normal")

# ✅ DO: Use effect sizes, not just p-values
# ✅ DO: Correct for multiple comparisons (Bonferroni)
# ✅ DO: Report confidence intervals

# ❌ DON'T: p-hack by trying many tests
# ❌ DON'T: Confuse correlation with causation
# ❌ DON'T: Ignore sample size requirements

Resources

Khan Academy Statistics
StatQuest with Josh Starmer
"Introduction to Statistical Learning" (ISLR)

Skill Certification Checklist:

Can calculate descriptive statistics
Can perform hypothesis tests (t-test, chi-square)
Can implement A/B testing
Can perform regression analysis
Can use matrix operations for ML

Maintainer

foryourhealth111-pixel Core maintainer

Source details

Full Name: foryourhealth111-pixel/Vibe-Skills
Branch: main
Path in repo: bundled/skills/statistics-math
License: Apache License 2.0
Topics: claude-code anthropic claude agent-skills automation mcp ai-agents cursor developer-tools agentic-coding skills llm codex claude-skills vibe-coding vibecoding opencode ai-skills ai-workflow windsurf

Featured Tools

Join Our Newsletter

Stay updated with the latest AI tools, news, and offers by subscribing to our weekly newsletter.

Recommended Agent Skills

Expand your agent's capabilities with these related and highly-rated skills.

foryourhealth111-pixel/Vibe-Skills

pufferlib

This skill should be used when working with reinforcement learning tasks including high-performance RL training, custom environment development, vectorized parallel simulation, multi-agent systems, or integration with existing RL environments (Gymnasium, PettingZoo, Atari, Procgen, etc.). Use this skill for implementing PPO training, creating PufferEnv environments, optimizing RL performance, or developing policies with CNNs/LSTMs.

1,415 109

Explore

foryourhealth111-pixel/Vibe-Skills

fluidsim

Framework for computational fluid dynamics simulations using Python. Use when running fluid dynamics simulations including Navier-Stokes equations (2D/3D), shallow water equations, stratified flows, or when analyzing turbulence, vortex dynamics, or geophysical flows. Provides pseudospectral methods with FFT, HPC support, and comprehensive output analysis.

1,415 109

Explore

foryourhealth111-pixel/Vibe-Skills

metabolomics-workbench-database

Access NIH Metabolomics Workbench via REST API (4,200+ studies). Query metabolites, RefMet nomenclature, MS/NMR data, m/z searches, study metadata, for metabolomics and biomarker discovery.

1,415 109

Explore

foryourhealth111-pixel/Vibe-Skills

build-error-resolver

Compatibility alias for build-specific error resolution. Use this when VCO routes to build-error-resolver but the upstream agent is unavailable in the current runtime.

1,415 109

Explore

foryourhealth111-pixel/Vibe-Skills

geniml

This skill should be used when working with genomic interval data (BED files) for machine learning tasks. Use for training region embeddings (Region2Vec, BEDspace), single-cell ATAC-seq analysis (scEmbed), building consensus peaks (universes), or any ML-based analysis of genomic regions. Applies to BED file collections, scATAC-seq data, chromatin accessibility datasets, and region-based genomic feature learning.

1,415 109

Explore

foryourhealth111-pixel/Vibe-Skills

zinc-database

Access ZINC (230M+ purchasable compounds). Search by ZINC ID/SMILES, similarity searches, 3D-ready structures for docking, analog discovery, for virtual screening and drug discovery.

1,415 109

Explore

Didn't find tool you were looking for?

Search AI Tools

Install this agent skill to your Project

SKILL.md

Statistics & Mathematics

Quick Start

Core Concepts

1. Probability Distributions

2. Hypothesis Testing Framework

3. Linear Algebra Essentials

4. Regression Analysis

Tools & Technologies

Troubleshooting Guide

Best Practices

Resources

Recommended Agent Skills

pufferlib

fluidsim

metabolomics-workbench-database

build-error-resolver

geniml

zinc-database