Tutorial
Basic usage:
import pandas as pd
from mlsampler import RandomSampler, HybridSampler
Random Sampling
def create_demo_df() -> pd.DataFrame:
    """Build the small demo table used by the random-sampling example.

    Returns:
        A 5-row DataFrame whose columns are, in order: ``is_active``,
        ``is_negative``, ``score``, ``temperature``, ``category``,
        ``city`` and ``cost``. The positional order matters because the
        example below addresses columns by index.
    """
    data = {
        # Two 0/1 flag columns.
        "is_active": [1, 0, 1, 1, 0],
        "is_negative": [0, 1, 0, 0, 1],
        # Two numeric columns (integers and floats).
        "score": [10, 0, 50, 100, 5],
        "temperature": [-5.5, 20.0, 36.6, -1.2, 15.8],
        # Two string columns; note the last category is the string "100",
        # not a number.
        "category": ["A", "B", "C", "D", "100"],
        "city": ["Tokyo", "Osaka", "Nagoya", "Fukuoka", "Sapporo"],
        # Constant column: the same value on every row.
        "cost": [100] * 5,
    }
    return pd.DataFrame(data)
# Build the demo table. The sampler is set up from the frame's raw values
# (train.values), so every constraint below addresses columns by position.
train = create_demo_df()
sampler = RandomSampler.setup(train.values)
print(type(sampler).__qualname__)

# NOTE(review): presumably clears any previously registered constraints;
# confirm against the mlsampler documentation.
sampler.reset_constraints()

# Columns 0-1 are the is_active / is_negative flag columns.
sampler.set_constraints("multihot", cols=[0, 1])

# Columns 2-3 are the numeric score / temperature columns.
# NOTE(review): the exact meaning of max_used is defined by mlsampler.
sampler.set_constraints("random", cols=[2, 3], max_used=1)

# Columns 4-5 are category / city; the (category, city) pairs observed in
# the demo table are passed as the candidate values.
category_city_values = train[["category", "city"]].to_numpy()
sampler.set_constraints(
    "categories",
    cols=[4, 5],
    values=category_city_values,
    strength="soft",
)

# Draw 20 rows and show them under the original column names.
result = sampler.sample(20)
sampled_frame = pd.DataFrame(result, columns=train.columns)
print(sampled_frame)
HyperGrid Sampling
# Small mixed-type table: two float columns, two integer columns
# (rank, isOk) and one string column (bloodType).
df = pd.DataFrame(
    [
        [0.1, 0.9, 1, 0, "A"],
        [0.5, 0.4, 2, 1, "B"],
        [0.9, 0.75, 3, 0, "AB"],
        [0.2, 0.8, 4, 1, "O"],
    ],
    columns=["ratio1", "ratio2", "rank", "isOk", "bloodType"],
)

# NOTE(review): this snippet originally called HyperGridSampler, which is
# never imported; the import at the top of the tutorial brings in
# HybridSampler, so that name is used here. Confirm the exported class
# name against the mlsampler API.
sampler = HybridSampler.setup(df.values, random_state=42)

# Print the dtype recorded for each column in the sampler's config.
print("\n=== dtype check ===")
for i, feature in enumerate(sampler.config.features):
    print(f"col {i}: {feature.dtype}")

# Draw 1000 rows and show them under the original column names.
samples = sampler.sample(1000)
print(pd.DataFrame(samples, columns=df.columns))
Notes
RandomSampler is recommended when:
- You have multiple constraints to satisfy simultaneously
- The relationships between variables are complex or hard to express analytically

HybridSampler is recommended when:
- You want better coverage of continuous feature space
- You are performing design of experiments (DoE)