Agent skill

data-ai-skills

Master machine learning, data engineering, AI engineering, LLMs, prompt engineering, and MLOps. Build intelligent systems with Python.

Stars 163
Forks 31

Install this agent skill in your project

npx add-skill https://github.com/majiayu000/claude-skill-registry/tree/main/skills/data/data-ai-skills

SKILL.md

Data & AI Engineering Skills

Python Fundamentals

python
import numpy as np
import pandas as pd
from typing import Optional, List

# NumPy arrays with type hints
def process_array(arr: np.ndarray) -> np.ndarray:
    """Clamp every element of ``arr`` into the closed interval [0, 1].

    Args:
        arr: Array to clamp; must contain at least one element.

    Returns:
        The result of ``np.clip(arr, 0, 1)``.

    Raises:
        ValueError: If ``arr`` has no elements.
    """
    has_elements = arr.size != 0
    if not has_elements:
        raise ValueError("Empty array not allowed")

    lower_bound, upper_bound = 0, 1
    return np.clip(arr, lower_bound, upper_bound)

# Pandas DataFrames with error handling
def load_data(path: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame, rejecting missing or empty input.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed DataFrame, guaranteed to be non-empty.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file has no content at all, or parses to an
            empty DataFrame (for example, a header row with no data rows).
    """
    # Keep the try body to the single call that can raise these errors.
    try:
        df = pd.read_csv(path)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Data file not found: {path}") from err
    except pd.errors.EmptyDataError as err:
        # A zero-byte file never reaches the ``df.empty`` check below; pandas
        # raises EmptyDataError (a ValueError subclass) while parsing, so
        # report it with the same message as any other empty dataset.
        raise ValueError(f"Empty dataset: {path}") from err

    if df.empty:
        raise ValueError(f"Empty dataset: {path}")
    return df

Machine Learning with Scikit-Learn

python
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import logging

logger = logging.getLogger(__name__)


def train_model(X, y, n_estimators=100, random_state=42):
    """Train a random-forest classifier and evaluate it on a held-out split.

    Args:
        X: Feature rows; must support ``len``.
        y: Class labels, one per row of ``X``.
        n_estimators: Number of trees in the forest.
        random_state: Seed used for both the train/test split and the
            forest itself, so repeated calls give the same result.

    Returns:
        A ``(model, report)`` tuple: the fitted classifier and the
        ``classification_report`` dictionary for the 20% test split.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length.
    """
    # Validate before doing (or logging) anything else.
    if len(X) != len(y):
        raise ValueError("X and y must have same length")

    logger.info("Training with %d samples", len(X))

    # Stratified 80/20 split keeps class proportions equal in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )

    # Seed the forest too; otherwise only the split would be reproducible.
    model = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)

    logger.info("Accuracy: %.4f", report["accuracy"])
    return model, report

Deep Learning with PyTorch

python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

class SimpleNet(nn.Module):
    """Two-layer feed-forward network with a single sigmoid output.

    Layout: Linear(input_dim -> hidden_dim) -> ReLU -> Dropout(0.2)
    -> Linear(hidden_dim -> 1) -> sigmoid.
    """

    def __init__(self, input_dim: int, hidden_dim: int = 64):
        """Create the layers.

        Args:
            input_dim: Size of the last dimension of the input tensor.
            hidden_dim: Width of the hidden layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the forward pass and return sigmoid-activated outputs."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        regularized = self.dropout(activated)
        logits = self.fc2(regularized)
        return torch.sigmoid(logits)

def train_epoch(model, dataloader, optimizer, loss_fn, device):
    """Train one epoch with gradient clipping.

    Args:
        model: Module to train; switched to training mode here.
        dataloader: Iterable yielding ``(batch_x, batch_y)`` pairs. It does
            not need to support ``len``.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_fn: Callable mapping ``(output, batch_y)`` to a scalar loss.
        device: Device that each batch is moved to before the forward pass.

    Returns:
        The mean of the per-batch loss values over the epoch.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        output = model(batch_x)
        loss = loss_fn(output, batch_y)
        loss.backward()

        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Batches are counted in the loop rather than via len(dataloader), so
    # loaders without a length work and an empty loader gets a clear error
    # instead of a ZeroDivisionError.
    if num_batches == 0:
        raise ValueError("dataloader yielded no batches")
    return total_loss / num_batches

LLM Applications (2025 Best Practices)

python
import json

import anthropic
import backoff
from anthropic import Anthropic

client = Anthropic()


# Retry only failures that can succeed on a second attempt (network problems,
# rate limiting, server-side errors). Retrying every Exception would also
# re-send requests that fail for permanent reasons such as bad credentials,
# invalid parameters, or bugs in this code.
@backoff.on_exception(
    backoff.expo,
    (
        anthropic.APIConnectionError,
        anthropic.RateLimitError,
        anthropic.InternalServerError,
    ),
    max_tries=3,
)
def query_llm(
    prompt: str,
    max_tokens: int = 1024,
    model: str = "claude-sonnet-4-20250514",
) -> str:
    """Send a single-turn prompt to the LLM and return its text reply.

    Args:
        prompt: User message to send.
        max_tokens: Upper bound on the number of tokens in the reply.
        model: Identifier of the model to query.

    Returns:
        The text of the first text block in the response.

    Raises:
        ValueError: If the response contains no text block.
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[
            {"role": "user", "content": prompt}
        ],
    )

    # The response is a list of content blocks; do not assume the first one
    # exists or is a text block.
    for block in message.content:
        if block.type == "text":
            return block.text
    raise ValueError("LLM response contained no text content")

# Structured output
def extract_entities(text: str) -> dict:
    """Ask the LLM for named entities in ``text`` and validate the reply.

    Args:
        text: Text to extract entities from.

    Returns:
        The parsed JSON object, containing at least the keys ``persons``,
        ``organizations`` and ``locations``.

    Raises:
        ValueError: If the reply is not valid JSON, is not a JSON object,
            or lacks one of the required keys.
    """
    prompt = f"""Extract entities from the text.
    Return JSON with keys: persons, organizations, locations.

    Text: {text}
    """
    response = query_llm(prompt)

    # Model output is untrusted: check both syntax and shape before returning.
    try:
        entities = json.loads(response)
    except json.JSONDecodeError as err:
        raise ValueError(f"LLM did not return valid JSON: {err}") from err

    if not isinstance(entities, dict):
        raise ValueError(
            f"Expected a JSON object, got {type(entities).__name__}"
        )

    required_keys = ("persons", "organizations", "locations")
    missing = [key for key in required_keys if key not in entities]
    if missing:
        raise ValueError(f"LLM response is missing keys: {', '.join(missing)}")

    return entities

Data Pipeline with Checkpointing

python
import os
from pathlib import Path

def process_with_checkpoint(
    df: pd.DataFrame,
    checkpoint_dir: str,
    step_name: str
) -> pd.DataFrame:
    """Run the transform for a step, reusing a saved checkpoint if present.

    Args:
        df: Input data passed to ``expensive_transform``.
        checkpoint_dir: Directory holding the checkpoint files.
        step_name: Name of the step; the checkpoint is stored as
            ``<checkpoint_dir>/<step_name>.parquet``.

    Returns:
        The checkpointed result if one exists, otherwise the freshly
        computed result (which is saved before returning).

    Note:
        ``expensive_transform`` is not defined in the visible source; it
        must be provided by the surrounding module.
    """
    checkpoint_path = Path(checkpoint_dir) / f"{step_name}.parquet"

    if checkpoint_path.exists():
        print(f"Loading checkpoint: {step_name}")
        return pd.read_parquet(checkpoint_path)

    # Process
    result = expensive_transform(df)

    # Save checkpoint. Write to a temporary sibling file and rename it into
    # place, so an interrupted write never leaves a truncated file that a
    # later run would load as a valid checkpoint.
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = checkpoint_path.with_name(checkpoint_path.name + ".tmp")
    try:
        result.to_parquet(tmp_path)
        tmp_path.replace(checkpoint_path)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)

    return result

MLOps with MLflow

python
import mlflow
import mlflow.sklearn

def train_with_tracking(X_train, y_train, X_test, y_test, params):
    """Fit a random forest inside an MLflow run and record the outcome.

    The run is created under the "model-training" experiment and logs the
    hyperparameters, the test-set accuracy, and the fitted model.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features used for scoring.
        y_test: Held-out labels used for scoring.
        params: Keyword arguments for ``RandomForestClassifier``; also
            logged as the run's parameters.

    Returns:
        A ``(model, accuracy)`` tuple.
    """
    mlflow.set_experiment("model-training")

    with mlflow.start_run():
        # Hyperparameters are recorded before training starts.
        mlflow.log_params(params)

        classifier = RandomForestClassifier(**params)
        classifier.fit(X_train, y_train)

        test_accuracy = classifier.score(X_test, y_test)
        mlflow.log_metric("accuracy", test_accuracy)

        # Store the fitted estimator as a run artifact.
        mlflow.sklearn.log_model(classifier, "model")

    return classifier, test_accuracy

Model Evaluation Metrics

| Task | Primary Metric | Secondary Metrics |
|------|----------------|-------------------|
| Classification | Accuracy, F1 | Precision, Recall, AUC |
| Regression | RMSE, MAE | R², MAPE |
| Ranking | NDCG | MRR, MAP |
| Clustering | Silhouette | Davies-Bouldin |

Unit Test Template

python
import pytest
import numpy as np
from your_module import train_model, process_array

class TestMLFunctions:
    """Unit tests for ``train_model`` and ``process_array``."""

    @pytest.fixture
    def sample_data(self):
        """Return a fixed binary-classification dataset of 100 rows."""
        # Seeded generator: unseeded random data makes failures unrepeatable.
        rng = np.random.default_rng(42)
        X = rng.standard_normal((100, 10))
        y = rng.integers(0, 2, 100)
        return X, y

    def test_train_model(self, sample_data):
        """Training returns a model and a report with a valid accuracy."""
        X, y = sample_data
        model, report = train_model(X, y)

        assert model is not None
        assert 'accuracy' in report
        assert 0 <= report['accuracy'] <= 1

    def test_process_array_empty(self):
        """An empty array is rejected with a ValueError."""
        with pytest.raises(ValueError, match="Empty array"):
            process_array(np.array([]))

    def test_process_array_clips(self):
        """Values outside [0, 1] are clamped; values inside are kept."""
        # ASCII hyphen-minus here: a Unicode minus sign is a SyntaxError.
        arr = np.array([-1, 0.5, 2])
        result = process_array(arr)
        assert result.min() >= 0
        assert result.max() <= 1
        np.testing.assert_array_equal(result, [0, 0.5, 1])

Troubleshooting Guide

| Symptom | Cause | Solution |
|---------|-------|----------|
| CUDA OOM | Batch too large | Reduce batch size, use gradient accumulation |
| Loss NaN | Learning rate too high | Reduce LR, add gradient clipping |
| Overfitting | Model too complex | Add regularization, more data |
| Slow training | I/O bottleneck | Use DataLoader workers |

Key Concepts Checklist

  • Python basics and NumPy
  • Pandas data manipulation
  • Data visualization (Matplotlib, Seaborn)
  • Supervised learning (classification, regression)
  • Unsupervised learning (clustering)
  • Feature engineering
  • Model evaluation and metrics
  • Hyperparameter tuning
  • Cross-validation
  • Handling imbalanced data
  • Deep learning basics
  • LLM integration
  • Prompt engineering
  • MLOps pipeline setup

Source: https://roadmap.sh | Version: 2.0.0 | Last Updated: 2025-01-01

Didn't find the tool you were looking for?

Be as detailed as possible for better results