Agent skill
statistics-math
Statistics, probability, linear algebra, and mathematical foundations for data science
Install this agent skill to your Project
npx add-skill https://github.com/foryourhealth111-pixel/Vibe-Skills/tree/main/bundled/skills/statistics-math
SKILL.md
Statistics & Mathematics
Mathematical foundations for data science, machine learning, and statistical analysis.
Quick Start
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
# Descriptive Statistics
data = np.array([23, 45, 67, 32, 45, 67, 89, 12, 34, 56])
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
print(f"Std Dev: {np.std(data, ddof=1):.2f}")
print(f"IQR: {np.percentile(data, 75) - np.percentile(data, 25):.2f}")
# Hypothesis Testing
sample_a = [23, 45, 67, 32, 45]
sample_b = [56, 78, 45, 67, 89]
t_stat, p_value = stats.ttest_ind(sample_a, sample_b)
print(f"T-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
if p_value < 0.05:
print("Reject null hypothesis: significant difference")
else:
print("Fail to reject null hypothesis")
Core Concepts
1. Probability Distributions
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Normal Distribution
mu, sigma = 100, 15
normal_dist = stats.norm(loc=mu, scale=sigma)
x = np.linspace(50, 150, 100)
# PDF, CDF calculations
print(f"P(X < 85): {normal_dist.cdf(85):.4f}")
print(f"P(X > 115): {1 - normal_dist.cdf(115):.4f}")
print(f"95th percentile: {normal_dist.ppf(0.95):.2f}")
# Binomial Distribution (discrete)
n, p = 100, 0.3
binom_dist = stats.binom(n=n, p=p)
print(f"P(X = 30): {binom_dist.pmf(30):.4f}")
print(f"P(X <= 30): {binom_dist.cdf(30):.4f}")
# Poisson Distribution (events per time)
lambda_param = 5
poisson_dist = stats.poisson(mu=lambda_param)
print(f"P(X = 3): {poisson_dist.pmf(3):.4f}")
# Central Limit Theorem demonstration
population = np.random.exponential(scale=10, size=100000)
sample_means = [np.mean(np.random.choice(population, 30)) for _ in range(1000)]
print(f"Sample means are approximately normal: mean={np.mean(sample_means):.2f}")
2. Hypothesis Testing Framework
from scipy import stats
import numpy as np
class HypothesisTest:
"""Framework for statistical hypothesis testing."""
@staticmethod
def two_sample_ttest(group_a, group_b, alpha=0.05):
"""Independent samples t-test."""
t_stat, p_value = stats.ttest_ind(group_a, group_b)
effect_size = (np.mean(group_a) - np.mean(group_b)) / np.sqrt(
(np.var(group_a) + np.var(group_b)) / 2
)
return {
"t_statistic": t_stat,
"p_value": p_value,
"significant": p_value < alpha,
"effect_size_cohens_d": effect_size
}
@staticmethod
def chi_square_test(observed, expected=None, alpha=0.05):
"""Chi-square test for categorical data."""
if expected is None:
chi2, p_value, dof, expected = stats.chi2_contingency(observed)
else:
chi2, p_value = stats.chisquare(observed, expected)
dof = len(observed) - 1
return {
"chi2_statistic": chi2,
"p_value": p_value,
"degrees_of_freedom": dof,
"significant": p_value < alpha
}
@staticmethod
def ab_test_proportion(conversions_a, total_a, conversions_b, total_b, alpha=0.05):
"""Two-proportion z-test for A/B testing."""
p_a = conversions_a / total_a
p_b = conversions_b / total_b
p_pooled = (conversions_a + conversions_b) / (total_a + total_b)
se = np.sqrt(p_pooled * (1 - p_pooled) * (1/total_a + 1/total_b))
z_stat = (p_a - p_b) / se
p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
return {
"conversion_a": p_a,
"conversion_b": p_b,
"lift": (p_b - p_a) / p_a * 100,
"z_statistic": z_stat,
"p_value": p_value,
"significant": p_value < alpha
}
# Usage
result = HypothesisTest.ab_test_proportion(
conversions_a=120, total_a=1000,
conversions_b=150, total_b=1000
)
print(f"Lift: {result['lift']:.1f}%, p-value: {result['p_value']:.4f}")
3. Linear Algebra Essentials
import numpy as np
# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
# Basic operations
print("Matrix multiplication:", A @ B)
print("Element-wise:", A * B)
print("Transpose:", A.T)
print("Inverse:", np.linalg.inv(A))
print("Determinant:", np.linalg.det(A))
# Eigenvalues and eigenvectors (PCA foundation)
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"Eigenvalues: {eigenvalues}")
# Singular Value Decomposition (dimensionality reduction)
U, S, Vt = np.linalg.svd(A)
print(f"Singular values: {S}")
# Solving linear systems: Ax = b
b = np.array([5, 11])
x = np.linalg.solve(A, b)
print(f"Solution: {x}")
# Cosine similarity (NLP, recommendations)
def cosine_similarity(v1, v2):
return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
vec1 = np.array([1, 2, 3])
vec2 = np.array([4, 5, 6])
print(f"Cosine similarity: {cosine_similarity(vec1, vec2):.4f}")
4. Regression Analysis
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
# Multiple Linear Regression with statsmodels
X = np.random.randn(100, 3)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + np.random.randn(100)*0.5
X_with_const = sm.add_constant(X)
model = sm.OLS(y, X_with_const).fit()
print(model.summary())
print(f"R-squared: {model.rsquared:.4f}")
print(f"Coefficients: {model.params}")
print(f"P-values: {model.pvalues}")
# Regularization comparison
X_train, y_train = X[:80], y[:80]
X_test, y_test = X[80:], y[80:]
models = {
"OLS": LinearRegression(),
"Ridge": Ridge(alpha=1.0),
"Lasso": Lasso(alpha=0.1)
}
for name, model in models.items():
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"{name}: R²={r2_score(y_test, y_pred):.4f}, RMSE={np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
Tools & Technologies
| Tool | Purpose | Version (2025) |
|---|---|---|
| NumPy | Numerical computing | 1.26+ |
| SciPy | Scientific computing | 1.12+ |
| pandas | Data manipulation | 2.2+ |
| statsmodels | Statistical models | 0.14+ |
| scikit-learn | ML algorithms | 1.4+ |
Troubleshooting Guide
| Issue | Symptoms | Root Cause | Fix |
|---|---|---|---|
| Low p-value, small effect | Significant but meaningless | Large sample size | Check effect size |
| High variance | Unstable estimates | Small sample, outliers | More data, robust methods |
| Multicollinearity | Inflated coefficients | Correlated features | VIF check, remove features |
| Heteroscedasticity | Invalid inference | Non-constant variance | Weighted least squares |
Best Practices
# ✅ DO: Check assumptions before testing
from scipy.stats import shapiro
stat, p = shapiro(data)
if p > 0.05:
print("Data is approximately normal")
# ✅ DO: Use effect sizes, not just p-values
# ✅ DO: Correct for multiple comparisons (Bonferroni)
# ✅ DO: Report confidence intervals
# ❌ DON'T: p-hack by trying many tests
# ❌ DON'T: Confuse correlation with causation
# ❌ DON'T: Ignore sample size requirements
Resources
- Khan Academy Statistics
- StatQuest with Josh Starmer
- "Introduction to Statistical Learning" (ISLR)
Skill Certification Checklist:
- Can calculate descriptive statistics
- Can perform hypothesis tests (t-test, chi-square)
- Can implement A/B testing
- Can perform regression analysis
- Can use matrix operations for ML
Recommended Agent Skills
Expand your agent's capabilities with these related and highly-rated skills.
pufferlib
This skill should be used when working with reinforcement learning tasks including high-performance RL training, custom environment development, vectorized parallel simulation, multi-agent systems, or integration with existing RL environments (Gymnasium, PettingZoo, Atari, Procgen, etc.). Use this skill for implementing PPO training, creating PufferEnv environments, optimizing RL performance, or developing policies with CNNs/LSTMs.
fluidsim
Framework for computational fluid dynamics simulations using Python. Use when running fluid dynamics simulations including Navier-Stokes equations (2D/3D), shallow water equations, stratified flows, or when analyzing turbulence, vortex dynamics, or geophysical flows. Provides pseudospectral methods with FFT, HPC support, and comprehensive output analysis.
metabolomics-workbench-database
Access NIH Metabolomics Workbench via REST API (4,200+ studies). Query metabolites, RefMet nomenclature, MS/NMR data, m/z searches, study metadata, for metabolomics and biomarker discovery.
build-error-resolver
Compatibility alias for build-specific error resolution. Use this when VCO routes to build-error-resolver but the upstream agent is unavailable in the current runtime.
geniml
This skill should be used when working with genomic interval data (BED files) for machine learning tasks. Use for training region embeddings (Region2Vec, BEDspace), single-cell ATAC-seq analysis (scEmbed), building consensus peaks (universes), or any ML-based analysis of genomic regions. Applies to BED file collections, scATAC-seq data, chromatin accessibility datasets, and region-based genomic feature learning.
zinc-database
Access ZINC (230M+ purchasable compounds). Search by ZINC ID/SMILES, similarity searches, 3D-ready structures for docking, analog discovery, for virtual screening and drug discovery.
Didn't find tool you were looking for?