Test data is the fuel that drives automation — and bad fuel clogs the engine. Hardcoded data creates conflicts in parallel execution. Shared data creates dependencies between tests. Stale data drifts from production reality. Professional SDETs design test data systems that produce fresh, unique, realistic data on demand — making every test self-sufficient and every parallel run conflict-free.
Test Data Strategies — From Hardcoded to Engineered
The evolution of test data management follows a maturity curve from manual to fully automated.
from datetime import datetime
import random
import string
# ── Strategy 1: Faker — realistic fake data ──
# pip install Faker
# from faker import Faker
# fake = Faker()
# Usage examples:
# fake.name() → "Alice Johnson"
# fake.email() → "ajohnson42@example.com"
# fake.address() → "123 Oak Street, Springfield, IL 62704"
# fake.phone_number() → "+1-555-234-5678"
# fake.credit_card_number() → "4532015112830366"
# fake.date_of_birth() → datetime(1985, 3, 14)
# ── Strategy 2: Builder + Faker combined ──
class UserBuilder:
    """Fluent builder that produces a unique, realistic user payload per call.

    Each instance seeds a millisecond timestamp plus a 5-letter random
    suffix, so parallel test runs never collide on name/email uniqueness
    constraints.
    """

    def __init__(self):
        # Millisecond timestamp + random letters -> unique per invocation,
        # and human-readable enough to trace in logs and DB queries.
        ts = int(datetime.now().timestamp() * 1000)
        rand = ''.join(random.choices(string.ascii_lowercase, k=5))
        self._data = {
            "name": f"TestUser_{rand}",
            "email": f"test_{ts}_{rand}@example.com",
            "password": "SecurePass1!",
            "role": "customer",
            "verified": True,
            "address": {
                "line1": f"{random.randint(1,999)} Test Street",
                "city": "London",
                "postcode": "SW1A 1AA",
                "country": "GB",
            },
        }

    def with_role(self, role):
        """Override the default 'customer' role. Returns self for chaining."""
        self._data["role"] = role
        return self

    def with_address(self, **kwargs):
        """Merge the given fields into the default address. Returns self."""
        self._data["address"].update(kwargs)
        return self

    def unverified(self):
        """Mark the user as not email-verified. Returns self for chaining."""
        self._data["verified"] = False
        return self

    def build(self):
        """Return an independent snapshot of the built user dict.

        Bug fix: the previous ``dict(self._data)`` shallow copy shared the
        nested ``"address"`` dict, so mutating a built payload leaked back
        into the builder and into every other ``build()`` result.
        """
        snapshot = dict(self._data)
        snapshot["address"] = dict(self._data["address"])
        return snapshot
# ── Strategy 3: Database seeding via API ──
class TestDataSeeder:
    """Seeds test data into the application under test via its API.

    The network calls are left commented out in this teaching example;
    each method returns the payload(s) it would send.
    """

    def __init__(self, api_base_url):
        # Base URL of the application's API, e.g. "https://staging.example.com".
        self.base_url = api_base_url

    def seed_user(self, overrides=None):
        """Build (and would POST) a unique user, applying builder overrides.

        ``overrides`` maps a field name to a value and is dispatched to the
        matching ``UserBuilder.with_<field>`` setter. Dict values are
        expanded as keyword arguments (e.g. ``{"address": {"city": "Leeds"}}``
        reaches ``with_address(city="Leeds")``).

        Raises:
            ValueError: for an override key with no matching builder setter.
                The previous version silently dropped such keys, hiding typos.
        """
        builder = UserBuilder()
        for key, value in (overrides or {}).items():
            setter = getattr(builder, f"with_{key}", None)
            if setter is None:
                raise ValueError(f"Unknown user override: {key!r}")
            if isinstance(value, dict):
                # Keyword-style setters (e.g. with_address(**kwargs)) need the
                # dict expanded; the old code passed it positionally and crashed.
                setter(**value)
            else:
                setter(value)
        data = builder.build()
        # response = requests.post(f"{self.base_url}/api/test/users", json=data)
        # return response.json()
        return data

    def seed_products(self, count=5):
        """Build (and would POST) ``count`` randomized product payloads."""
        products = [
            {
                "name": f"Test Product {i+1}",
                "price": round(random.uniform(1.99, 99.99), 2),
                "category": random.choice(["electronics", "clothing", "books"]),
                "in_stock": True,
            }
            for i in range(count)
        ]
        # response = requests.post(f"{self.base_url}/api/test/products", json=products)
        return products

    def cleanup(self, user_id=None):
        """Delete seeded data (no-op here; a real impl calls the cleanup API)."""
        # requests.delete(f"{self.base_url}/api/test/cleanup", params={"user": user_id})
        pass
# ── Strategy 4: Test data lifecycle ──
# The four phases every piece of test data moves through, in order.
# Stored as compact rows, then expanded into the dict shape consumers expect.
_LIFECYCLE_ROWS = (
    ("1. Generate",
     "Builder + Faker create unique data per test",
     "Timestamp or UUID suffix ensures uniqueness across parallel runs"),
    ("2. Seed",
     "API calls create the data in the application before the test runs",
     "Use beforeEach / setup fixtures, not manual insertion"),
    ("3. Use",
     "Test interacts with the seeded data via UI or API",
     "Test references the generated data by its known attributes"),
    ("4. Cleanup",
     "afterEach / teardown deletes test data to prevent accumulation",
     "API-based cleanup or database transaction rollback"),
)

DATA_LIFECYCLE = [
    {"phase": phase, "how": how, "key": key}
    for phase, how, key in _LIFECYCLE_ROWS
]
# ── Anti-patterns ──
# Classic test-data mistakes, each paired with the concrete symptom and remedy.
_ANTI_PATTERN_ROWS = (
    ("Hardcoded shared credentials",
     "All tests use admin@test.com — parallel tests conflict on session",
     "Each test generates a unique user: test_{timestamp}@example.com"),
    ("Manual database inserts",
     "Test data created via SQL scripts that drift from application logic",
     "Create data via the application's API — validates business rules"),
    ("No cleanup after tests",
     "Test database grows until unique constraints fail or performance degrades",
     "afterEach cleanup via API, or database transaction rollback per test"),
)

ANTI_PATTERNS = [
    {"anti_pattern": name, "problem": problem, "fix": fix}
    for name, problem, fix in _ANTI_PATTERN_ROWS
]
# Demo: render each lifecycle phase with its "how" and "key" notes to stdout.
print("Test Data Lifecycle:")
for phase in DATA_LIFECYCLE:
    print(f"\n {phase['phase']}")
    print(f"   How: {phase['how']}")
    print(f"   Key: {phase['key']}")
A common implementation is `f"test_{int(time.time()*1000)}@example.com"`. This produces a unique email for every test invocation, eliminating "email already exists" errors in parallel execution. The timestamp approach is simpler than UUID and produces human-readable values that are easy to trace in logs and database queries.

Common Mistakes
Mistake 1 — All tests sharing the same test user account
❌ Wrong: 200 tests all log in as testuser@test.com — parallel execution causes session conflicts and unpredictable failures.
✅ Correct: Each test creates its own user via UserBuilder().build() and seeds it via API. Zero shared state, zero conflicts.
Mistake 2 — Creating test data via SQL instead of API
❌ Wrong: Inserting user records directly into the database, bypassing validation, hashing, and business rules.
✅ Correct: Creating users via POST /api/users — the application validates, hashes passwords, and sets up related records correctly.