Implementation Guide
Overview
This guide provides step-by-step instructions for implementing the automated business unit onboarding system.
Phase 1: Foundation (Week 1-2)
1.1 Database Schema
-- Create database schema for onboarding system
-- Tenants table: the table that every tenant_id foreign key in this schema references.
CREATE TABLE tenants (
    id          UUID      NOT NULL DEFAULT gen_random_uuid(),
    name        TEXT      NOT NULL,
    slug        TEXT      NOT NULL,
    status      TEXT      NOT NULL,
    settings    JSONB     NOT NULL DEFAULT '{}',
    permissions JSONB     NOT NULL DEFAULT '{}',
    rate_limits JSONB     NOT NULL DEFAULT '{}',
    created_at  TIMESTAMP NOT NULL DEFAULT NOW(),
    -- updated_at only receives a default on INSERT; no trigger in this section refreshes it.
    updated_at  TIMESTAMP NOT NULL DEFAULT NOW(),
    -- Constraint names below are the ones PostgreSQL generates for the inline forms.
    CONSTRAINT tenants_pkey PRIMARY KEY (id),
    CONSTRAINT tenants_slug_key UNIQUE (slug),
    CONSTRAINT tenants_status_check CHECK (status IN ('active', 'suspended', 'deleted'))
);
-- Tenant organization mappings
-- GitHub organization -> tenant. The UNIQUE constraint means a github_org maps to at most one tenant.
CREATE TABLE tenant_github_mappings (
    id         UUID      NOT NULL DEFAULT gen_random_uuid(),
    tenant_id  UUID      NOT NULL,
    github_org TEXT      NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
    CONSTRAINT tenant_github_mappings_pkey PRIMARY KEY (id),
    CONSTRAINT tenant_github_mappings_tenant_id_fkey FOREIGN KEY (tenant_id) REFERENCES tenants (id),
    CONSTRAINT tenant_github_mappings_github_org_key UNIQUE (github_org)
);
-- Terraform Cloud organization -> tenant. A tfc_org maps to at most one tenant.
CREATE TABLE tenant_tfc_mappings (
    id         UUID      NOT NULL DEFAULT gen_random_uuid(),
    tenant_id  UUID      NOT NULL,
    tfc_org    TEXT      NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
    CONSTRAINT tenant_tfc_mappings_pkey PRIMARY KEY (id),
    CONSTRAINT tenant_tfc_mappings_tenant_id_fkey FOREIGN KEY (tenant_id) REFERENCES tenants (id),
    CONSTRAINT tenant_tfc_mappings_tfc_org_key UNIQUE (tfc_org)
);
-- GCP organization id -> tenant. A gcp_org_id maps to at most one tenant.
CREATE TABLE tenant_gcp_mappings (
    id         UUID      NOT NULL DEFAULT gen_random_uuid(),
    tenant_id  UUID      NOT NULL,
    gcp_org_id TEXT      NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
    CONSTRAINT tenant_gcp_mappings_pkey PRIMARY KEY (id),
    CONSTRAINT tenant_gcp_mappings_tenant_id_fkey FOREIGN KEY (tenant_id) REFERENCES tenants (id),
    CONSTRAINT tenant_gcp_mappings_gcp_org_id_key UNIQUE (gcp_org_id)
);
-- Onboarding history
-- status is restricted by the CHECK below; the nullable columns (workspace_id,
-- error*, validation_report, completed_at, failed_at) are filled in only when applicable.
CREATE TABLE onboarding_history (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id),
    fingerprint TEXT NOT NULL,
    repo_url TEXT NOT NULL,
    workspace_id TEXT,
    status TEXT NOT NULL CHECK (status IN ('in_progress', 'completed', 'failed', 'rejected', 'validation_failed')),
    current_state TEXT,
    entities_created JSONB,
    error TEXT,
    error_state TEXT,
    validation_report JSONB,
    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
    completed_at TIMESTAMP,
    failed_at TIMESTAMP
);

-- PostgreSQL does not accept MySQL-style inline INDEX clauses inside CREATE TABLE,
-- and index names share one namespace per schema, so each index is created
-- separately under a table-qualified name.
CREATE INDEX onboarding_history_fingerprint_idx ON onboarding_history (fingerprint);
CREATE INDEX onboarding_history_repo_url_idx ON onboarding_history (repo_url);
CREATE INDEX onboarding_history_tenant_id_idx ON onboarding_history (tenant_id);
-- Onboarding state (for resumption)
-- At most one state row per onboarding_history row: the primary key is also the foreign key.
CREATE TABLE onboarding_state (
    onboarding_id   UUID      NOT NULL,
    current_state   TEXT      NOT NULL,
    context         JSONB     NOT NULL,
    last_checkpoint TIMESTAMP NOT NULL,
    updated_at      TIMESTAMP NOT NULL DEFAULT NOW(),
    CONSTRAINT onboarding_state_pkey PRIMARY KEY (onboarding_id),
    CONSTRAINT onboarding_state_onboarding_id_fkey FOREIGN KEY (onboarding_id) REFERENCES onboarding_history (id)
);
-- Onboarding progress tracking
-- The UNIQUE constraint allows each (onboarding, state, subtask) combination to be recorded once.
CREATE TABLE onboarding_progress (
    id            UUID      NOT NULL DEFAULT gen_random_uuid(),
    onboarding_id UUID      NOT NULL,
    state         TEXT      NOT NULL,
    subtask       TEXT      NOT NULL,
    completed_at  TIMESTAMP NOT NULL DEFAULT NOW(),
    CONSTRAINT onboarding_progress_pkey PRIMARY KEY (id),
    CONSTRAINT onboarding_progress_onboarding_id_fkey FOREIGN KEY (onboarding_id) REFERENCES onboarding_history (id),
    CONSTRAINT onboarding_progress_onboarding_id_state_subtask_key UNIQUE (onboarding_id, state, subtask)
);
-- Entities table (Backstage catalog)
-- Each entity belongs to exactly one tenant; metadata and spec are stored as JSONB documents.
CREATE TABLE entities (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID NOT NULL REFERENCES tenants(id),
    api_version TEXT NOT NULL,
    kind TEXT NOT NULL,
    metadata JSONB NOT NULL,
    spec JSONB NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMP NOT NULL DEFAULT NOW()
);

-- PostgreSQL has no inline INDEX clause in CREATE TABLE, and index names must be
-- unique within the schema, so the indexes are separate, table-qualified statements.
CREATE INDEX entities_tenant_id_idx ON entities (tenant_id);
CREATE INDEX entities_kind_idx ON entities (kind);
-- Expression indexes on the text value of metadata.name / metadata.namespace.
CREATE INDEX entities_metadata_name_idx ON entities ((metadata->>'name'));
CREATE INDEX entities_metadata_namespace_idx ON entities ((metadata->>'namespace'));
-- Entity relationships
-- Directed edge between two entities; the row is removed automatically when
-- either endpoint entity is deleted (ON DELETE CASCADE on both foreign keys).
CREATE TABLE entity_relationships (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    source_id UUID NOT NULL REFERENCES entities(id) ON DELETE CASCADE,
    target_id UUID NOT NULL REFERENCES entities(id) ON DELETE CASCADE,
    type TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT NOW()
);

-- Inline INDEX clauses are not valid PostgreSQL; create the indexes separately.
CREATE INDEX entity_relationships_source_id_idx ON entity_relationships (source_id);
CREATE INDEX entity_relationships_target_id_idx ON entity_relationships (target_id);
-- Sync webhooks
-- At most one webhook row per workspace_id (UNIQUE); new rows are enabled by default.
CREATE TABLE sync_webhooks (
    id            UUID      NOT NULL DEFAULT gen_random_uuid(),
    workspace_id  TEXT      NOT NULL,
    tenant_id     UUID      NOT NULL,
    business_unit TEXT      NOT NULL,
    webhook_id    TEXT      NOT NULL,
    webhook_url   TEXT      NOT NULL,
    enabled       BOOLEAN   NOT NULL DEFAULT true,
    created_at    TIMESTAMP NOT NULL DEFAULT NOW(),
    CONSTRAINT sync_webhooks_pkey PRIMARY KEY (id),
    CONSTRAINT sync_webhooks_tenant_id_fkey FOREIGN KEY (tenant_id) REFERENCES tenants (id),
    CONSTRAINT sync_webhooks_workspace_id_key UNIQUE (workspace_id)
);
-- Webhook tokens (for verification)
-- Stores a hash of the token rather than the token itself; one row per workspace_id.
CREATE TABLE webhook_tokens (
    id           UUID      NOT NULL DEFAULT gen_random_uuid(),
    workspace_id TEXT      NOT NULL,
    token_hash   TEXT      NOT NULL,
    created_at   TIMESTAMP NOT NULL DEFAULT NOW(),
    CONSTRAINT webhook_tokens_pkey PRIMARY KEY (id),
    CONSTRAINT webhook_tokens_workspace_id_key UNIQUE (workspace_id)
);
-- Sync schedules
-- At most one schedule row per workspace_id (UNIQUE); new rows are enabled by default.
CREATE TABLE sync_schedules (
    id            UUID      NOT NULL DEFAULT gen_random_uuid(),
    workspace_id  TEXT      NOT NULL,
    tenant_id     UUID      NOT NULL,
    business_unit TEXT      NOT NULL,
    schedule      TEXT      NOT NULL,
    enabled       BOOLEAN   NOT NULL DEFAULT true,
    next_run      TIMESTAMP NOT NULL,
    created_at    TIMESTAMP NOT NULL DEFAULT NOW(),
    CONSTRAINT sync_schedules_pkey PRIMARY KEY (id),
    CONSTRAINT sync_schedules_tenant_id_fkey FOREIGN KEY (tenant_id) REFERENCES tenants (id),
    CONSTRAINT sync_schedules_workspace_id_key UNIQUE (workspace_id)
);
-- Sync history
-- One row per sync run; status is limited to 'success' or 'failed' by the CHECK.
CREATE TABLE sync_history (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    workspace_id TEXT NOT NULL,
    tenant_id UUID NOT NULL REFERENCES tenants(id),
    status TEXT NOT NULL CHECK (status IN ('success', 'failed')),
    trigger TEXT NOT NULL,
    changes JSONB,
    error TEXT,
    duration_ms INTEGER,  -- duration in milliseconds
    created_at TIMESTAMP NOT NULL DEFAULT NOW()
);

-- Inline INDEX clauses are not valid PostgreSQL; create the indexes separately
-- under table-qualified names (index names are unique per schema).
CREATE INDEX sync_history_workspace_id_idx ON sync_history (workspace_id);
CREATE INDEX sync_history_created_at_idx ON sync_history (created_at);
-- Audit log
-- tenant_id is nullable here, unlike the other tenant-scoped tables; only
-- action and timestamp are mandatory.
CREATE TABLE audit_log (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id UUID REFERENCES tenants(id),
    user_id TEXT,
    action TEXT NOT NULL,
    resource_type TEXT,
    resource_id TEXT,
    ip_address TEXT,
    user_agent TEXT,
    details JSONB,
    timestamp TIMESTAMP NOT NULL DEFAULT NOW()
);

-- Inline INDEX clauses are not valid PostgreSQL; create the indexes separately
-- under table-qualified names (index names are unique per schema).
CREATE INDEX audit_log_tenant_id_idx ON audit_log (tenant_id);
CREATE INDEX audit_log_timestamp_idx ON audit_log (timestamp);
CREATE INDEX audit_log_action_idx ON audit_log (action);
-- Enable Row-Level Security
-- NOTE(review): table owners and superusers bypass RLS unless the table also has
-- FORCE ROW LEVEL SECURITY; confirm the application connects as a non-owner role.
ALTER TABLE entities ENABLE ROW LEVEL SECURITY;
ALTER TABLE entity_relationships ENABLE ROW LEVEL SECURITY;
ALTER TABLE onboarding_history ENABLE ROW LEVEL SECURITY;

-- RLS policies (tenant isolation)
-- The application must set app.current_tenant_id for the session/transaction;
-- current_setting() raises an error when the setting has never been defined.
CREATE POLICY tenant_isolation_entities ON entities
    FOR ALL
    USING (tenant_id = current_setting('app.current_tenant_id')::uuid);

CREATE POLICY tenant_isolation_onboarding ON onboarding_history
    FOR ALL
    USING (tenant_id = current_setting('app.current_tenant_id')::uuid);

-- entity_relationships has RLS enabled, and a table with RLS enabled but no
-- policy is default-deny for every non-owner role. It has no tenant_id column,
-- so a row is visible/writable only when both endpoint entities belong to the
-- current tenant.
CREATE POLICY tenant_isolation_entity_relationships ON entity_relationships
    FOR ALL
    USING (
        EXISTS (
            SELECT 1
            FROM entities AS src
            WHERE src.id = entity_relationships.source_id
              AND src.tenant_id = current_setting('app.current_tenant_id')::uuid
        )
        AND EXISTS (
            SELECT 1
            FROM entities AS tgt
            WHERE tgt.id = entity_relationships.target_id
              AND tgt.tenant_id = current_setting('app.current_tenant_id')::uuid
        )
    );
1.2 Core Dependencies
// package.json
{
  "dependencies": {
    "@backstage/backend-common": "^0.21.0",
    "@backstage/catalog-model": "^1.4.0",
    "@octokit/rest": "^20.0.0",
    "@hashicorp/js-releases": "^1.5.0",
    "knex": "^3.0.0",
    "pg": "^8.11.0",
    "cron": "^3.1.0",
    "express": "^4.18.0",
    "jsonwebtoken": "^9.0.0",
    "yaml": "^2.3.0",
    "zod": "^3.22.0",
    "prom-client": "^15.0.0",
    "winston": "^3.11.0"
  },
  "devDependencies": {
    "@types/node": "^20.0.0",
    "jest": "^29.0.0",
    "typescript": "^5.0.0"
  }
}
1.3 Environment Configuration
# .env
# Every value below is a placeholder: replace it locally and keep the real file out of version control.
# PostgreSQL connection string for the database that holds the onboarding schema
DATABASE_URL=postgresql://user:password@localhost:5432/backstage
BACKSTAGE_URL=https://backstage.example.com
# GitHub
GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxxx
# Placeholder only; use a long random secret (presumably used to verify webhook deliveries; confirm in the handler)
GITHUB_WEBHOOK_SECRET=secret123
# Terraform Cloud
TFC_TOKEN=xxxxxxxxxxxxxxxxxxxxx
# Placeholder only; use a long random secret
TFC_WEBHOOK_SECRET=secret456
# GCP
# Filesystem path to a service-account key file
GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
# JWT
# Placeholder only; use a long random secret
JWT_SECRET=your-jwt-secret
# Slack (optional)
SLACK_WEBHOOK_URL=https://hooks.slack.com/services/xxx
Phase 2: Core Services (Week 3-4)
2.1 Tenant Resolver
# Create service file
# touch does not create parent directories, so make sure src/tenancy exists first.
mkdir -p src/tenancy
touch src/tenancy/tenant-resolver.ts
Implement the TenantResolver class as described in 06-multi-client-isolation.md.
2.2 Detection System
# Create detection services: one empty .ts file per detector module
detection_dir=src/onboarding/detection
mkdir -p "$detection_dir"
for module in naming-convention github-topics backstage-config tfc-tags terraform-state file-structure composite-detector; do
  touch "$detection_dir/$module.ts"
done
Implement detection algorithms from 04-detection-algorithms.md.
2.3 Validation System
# Create validation services: one empty .ts file per validator module
validation_dir=src/onboarding/validation
mkdir -p "$validation_dir"
for module in tenant-validator duplicate-detector repo-accessibility workspace-validator metadata-validator state-validator gcp-resource-validator entity-validator quality-scorer orchestrator; do
  touch "$validation_dir/$module.ts"
done
Implement validators from 05-validation-quality-gates.md.
Phase 3: Workflow Engine (Week 5-6)
3.1 State Machine
# Create workflow engine: one empty .ts file per workflow module
workflows_dir=src/onboarding/workflows
mkdir -p "$workflows_dir"
for module in state-machine event-router state-persistence state-handlers; do
  touch "$workflows_dir/$module.ts"
done
Implement state machine from 03-workflow-state-machine.md.
3.2 State Handlers
// src/onboarding/workflows/state-handlers.ts

/** Shape shared by every handler: receives the onboarding context, resolves with no value. */
type StateHandler = (context: OnboardingContext) => Promise<void>;

/**
 * Skeleton for the per-state handlers of the onboarding workflow.
 * Handlers are arrow-function properties, so `this` stays bound to the
 * instance when a handler is passed around as a callback.
 */
export class StateHandlers {
  handleReceived: StateHandler = async context => {
    // Implementation
  };

  handleValidating: StateHandler = async context => {
    // Implementation
  };

  handleDiscovering: StateHandler = async context => {
    // Implementation
  };

  // ... implement all state handlers
}
Phase 4: Triggers (Week 7)
4.1 Trigger Handlers (GitHub, TFC, Polling, Manual)
# Create trigger handlers: one empty .ts file per trigger module
triggers_dir=src/onboarding/triggers
mkdir -p "$triggers_dir"
for module in github-webhook tfc-webhook polling-discovery manual-trigger; do
  touch "$triggers_dir/$module.ts"
done
Implement triggers from 02-trigger-mechanisms.md.
4.2 Webhook Routes
// src/api/routes/webhook-routes.ts
import express from 'express';
import { GitHubWebhookHandler } from '../../onboarding/triggers/github-webhook';
import { TfcWebhookHandler } from '../../onboarding/triggers/tfc-webhook';
// Router exposing the inbound webhook endpoints for GitHub and Terraform Cloud.
const router = express.Router();

// NOTE(review): eventRouter and tenantResolver are not imported or declared in
// this snippet; they must be in scope (imported or injected) for it to compile.

// Express 4 does not observe promises returned by async handlers, so a rejection
// from handler.handle() would become an unhandled rejection and leave the request
// hanging. Each route forwards failures to the error middleware via next(err).
router.post('/webhooks/github', async (req, res, next) => {
  try {
    const handler = new GitHubWebhookHandler(eventRouter, tenantResolver);
    await handler.handle(req, res);
  } catch (err) {
    next(err);
  }
});

router.post('/webhooks/tfc', async (req, res, next) => {
  try {
    const handler = new TfcWebhookHandler(eventRouter, tenantResolver);
    await handler.handle(req, res);
  } catch (err) {
    next(err);
  }
});

export default router;
Phase 5: Synchronization (Week 8)
5.1 Sync Services
# Create sync services: the directory tree first, then the initial empty source files
sync_root=src/sync
for subdir in setup handlers scheduler service watchers drift monitoring; do
  mkdir -p "$sync_root/$subdir"
done
touch \
  "$sync_root/setup/tfc-webhook-setup.ts" \
  "$sync_root/handlers/tfc-webhook-handler.ts" \
  "$sync_root/scheduler/sync-job-scheduler.ts" \
  "$sync_root/service/sync-service.ts"
Implement sync from 08-synchronization-setup.md.
5.2 Start Scheduled Jobs
// src/index.ts
import { SyncJobScheduler } from './sync/scheduler/sync-job-scheduler';
/**
 * Process entry point: performs service initialization and starts the sync
 * scheduler. Rejects if any startup step fails.
 */
async function main() {
  // ... other initialization
  // Start sync scheduler
  // NOTE(review): every running instance of this process calls startAll();
  // whether SyncJobScheduler de-duplicates jobs across instances is not visible here.
  const scheduler = new SyncJobScheduler(syncService);
  await scheduler.startAll();
  console.log('Sync scheduler started');
}

// A bare main() call would leave a startup failure as an unhandled rejection.
// Log the error and exit non-zero so the supervisor (e.g. Kubernetes) can restart the process.
main().catch(err => {
  console.error('Failed to start onboarding service', err);
  process.exit(1);
});
Phase 6: API & UI (Week 9)
6.1 API Routes
// src/api/routes/onboarding-routes.ts
import express from 'express';
import { tenantMiddleware } from '../middleware/tenant-middleware';
import { ManualTriggerAPI } from '../../onboarding/triggers/manual-trigger';
// Router for the onboarding API; tenantMiddleware runs before every route registered below.
const router = express.Router();
router.use(tenantMiddleware);
// NOTE(review): eventRouter and auth are not imported or declared in this snippet;
// they must be in scope (imported or injected) for it to compile.
const api = new ManualTriggerAPI(eventRouter, auth);
// Methods are bound so that `this` still refers to `api` when Express invokes them as callbacks.
router.post('/onboarding/trigger', api.triggerOnboarding.bind(api));
router.get('/onboarding/:id', api.getOnboardingStatus.bind(api));
export default router;
6.2 Backstage Plugin
# Create Backstage plugin
# Run from the repository root. `backstage-cli create-plugin` is no longer part of
# the Backstage CLI; the scaffolder is now the `new` command, which an app created
# with create-app exposes as the `yarn new` script. It generates the plugin under plugins/.
yarn new
# When prompted, choose the frontend plugin template
# Plugin ID: onboarding
// plugins/onboarding/src/components/OnboardingPage.tsx
import React from 'react';
import { Content, Header, Page } from '@backstage/core-components';
import { ManualOnboardingButton } from './ManualOnboardingButton';
/**
 * Top-level page of the onboarding plugin: a Backstage page with the
 * "tool" theme, a header, and the manual onboarding button as its content.
 */
export const OnboardingPage = () => (
  <Page themeId="tool">
    <Header title="Business Unit Onboarding" />
    <Content>
      <ManualOnboardingButton />
    </Content>
  </Page>
);
Phase 7: Testing (Week 10)
7.1 Unit Tests
// tests/unit/detection/naming-convention.test.ts
import { NamingConventionDetector } from '../../../src/onboarding/detection/naming-convention';
// Unit tests for NamingConventionDetector.detect(); each test gets a fresh detector.
describe('NamingConventionDetector', () => {
  let detector: NamingConventionDetector;

  beforeEach(() => {
    detector = new NamingConventionDetector();
  });

  it('should detect business unit from repository name', () => {
    const outcome = detector.detect('bu-finance-infrastructure');

    expect(outcome.detected).toBe(true);
    expect(outcome.confidence).toBe(0.9);
    expect(outcome.metadata.businessUnit).toBe('finance');
  });

  it('should reject non-BU repository names', () => {
    const outcome = detector.detect('my-app-frontend');

    expect(outcome.detected).toBe(false);
    expect(outcome.confidence).toBe(0);
  });
});
7.2 Integration Tests
// tests/integration/onboarding-flow.test.ts
// Integration test: runs the whole onboarding workflow against a test database.
describe('Onboarding Flow', () => {
  // tenant_id columns are UUIDs, so a non-UUID string such as 'test-tenant' makes
  // PostgreSQL reject the query ("invalid input syntax for type uuid").
  // NOTE(review): createTestTenant() must seed a tenant with this id; confirm against the helper.
  const TEST_TENANT_ID = '00000000-0000-4000-8000-000000000001';

  beforeEach(async () => {
    await setupTestDatabase();
    await createTestTenant();
  });

  afterEach(async () => {
    await teardownTestDatabase();
  });

  it('should complete full onboarding workflow', async () => {
    const context = {
      tenantId: TEST_TENANT_ID,
      repoUrl: 'https://github.com/test-org/bu-finance-infrastructure',
      workspaceId: 'ws-test123',
    };

    const result = await onboardingService.trigger(context);

    expect(result.status).toBe('completed');
    expect(result.entities).toHaveLength(3); // Domain, System, Component

    // Verify entities in database
    const entities = await db('entities')
      .where('tenant_id', context.tenantId)
      .select('*');
    expect(entities).toHaveLength(3);
  });
});
7.3 E2E Tests
// tests/e2e/github-webhook.test.ts
// E2E test: posts a GitHub "repository created" webhook and waits for the
// resulting onboarding record to reach a terminal status.
describe('GitHub Webhook E2E', () => {
  // How often to re-query the database, and how long to wait overall.
  const POLL_INTERVAL_MS = 250;
  const PROCESSING_DEADLINE_MS = 20000;

  it('should trigger onboarding from GitHub repository creation', async () => {
    // Simulate GitHub webhook
    // NOTE(review): no signature header is sent; if the handler verifies
    // deliveries against GITHUB_WEBHOOK_SECRET this request will be rejected.
    const payload = {
      action: 'created',
      repository: {
        name: 'bu-finance-infrastructure',
        html_url: 'https://github.com/acme-corp/bu-finance-infrastructure',
        owner: { login: 'acme-corp' },
      },
    };

    const response = await request(app)
      .post('/api/webhooks/github')
      .set('X-GitHub-Event', 'repository')
      .send(payload);
    expect(response.status).toBe(202);

    // Poll instead of sleeping for a fixed 5 s: the fixed sleep alone consumes
    // Jest's default 5000 ms test timeout, and polling finishes as soon as the
    // record leaves 'in_progress'.
    const deadline = Date.now() + PROCESSING_DEADLINE_MS;
    let onboarding;
    do {
      onboarding = await db('onboarding_history')
        .where('repo_url', payload.repository.html_url)
        .orderBy('created_at', 'desc')
        .first();
      if (onboarding && onboarding.status !== 'in_progress') {
        break;
      }
      await new Promise(resolve => setTimeout(resolve, POLL_INTERVAL_MS));
    } while (Date.now() < deadline);

    // Verify onboarding completed (fail with an assertion, not a TypeError, if no row exists)
    expect(onboarding).toBeDefined();
    expect(onboarding.status).toBe('completed');
  }, PROCESSING_DEADLINE_MS + 5000); // explicit per-test timeout above the polling deadline
});
Phase 8: Monitoring & Observability (Week 11)
8.1 Metrics
// src/monitoring/metrics.ts
import { Registry, Counter, Histogram, Gauge } from 'prom-client';
// Dedicated registry: every metric below registers here rather than on prom-client's default registry.
export const registry = new Registry();

// Histogram bucket upper bounds, in seconds.
const ONBOARDING_DURATION_BUCKETS = [1, 5, 10, 30, 60, 120, 300];

/** Counts onboarding attempts, labelled by tenant and status. */
export const onboardingTotal = new Counter({
  registers: [registry],
  name: 'onboarding_total',
  help: 'Total number of onboarding attempts',
  labelNames: ['tenant_id', 'status'],
});

/** Observes onboarding duration in seconds, labelled by tenant and state. */
export const onboardingDuration = new Histogram({
  registers: [registry],
  name: 'onboarding_duration_seconds',
  help: 'Onboarding duration in seconds',
  labelNames: ['tenant_id', 'state'],
  buckets: ONBOARDING_DURATION_BUCKETS,
});

/**
 * Tracks entities created, labelled by tenant and kind.
 * NOTE(review): the `_total` suffix conventionally denotes a Counter, but this
 * is a Gauge; confirm which is intended before relying on rate() queries.
 */
export const entitiesCreated = new Gauge({
  registers: [registry],
  name: 'entities_created_total',
  help: 'Total number of entities created',
  labelNames: ['tenant_id', 'kind'],
});
8.2 Metrics Endpoint
// src/api/routes/metrics-routes.ts
import express from 'express';
import { registry } from '../../monitoring/metrics';
// Router exposing the Prometheus scrape endpoint for the onboarding metrics registry.
const router = express.Router();

router.get('/metrics', async (req, res) => {
  // registry.metrics() is async and can reject; Express 4 does not catch
  // rejections from async handlers, so answer with a 500 instead of leaving
  // the scrape request hanging.
  try {
    res.set('Content-Type', registry.contentType);
    res.end(await registry.metrics());
  } catch (err) {
    res.status(500).end(String(err));
  }
});

export default router;
8.3 Logging
// src/monitoring/logger.ts
import winston from 'winston';
// File transports: error.log receives 'error' level only, combined.log receives
// everything the logger emits. Paths are relative to the working directory.
const fileTransports = [
  new winston.transports.File({ filename: 'error.log', level: 'error' }),
  new winston.transports.File({ filename: 'combined.log' }),
];

/** Shared JSON logger at 'info' level, tagged with service "onboarding". */
export const logger = winston.createLogger({
  level: 'info',
  format: winston.format.json(),
  defaultMeta: { service: 'onboarding' },
  transports: fileTransports,
});

// Outside production, also log to the console in winston's simple text format.
const isProduction = process.env.NODE_ENV === 'production';
if (!isProduction) {
  logger.add(
    new winston.transports.Console({
      format: winston.format.simple(),
    }),
  );
}
Phase 9: Deployment (Week 12)
9.1 Docker
# Dockerfile
# Two stages: installing production-only dependencies before `npm run build`
# fails because the compiler (typescript) is a devDependency. The build stage
# installs everything and compiles; the runtime stage ships only production
# dependencies plus the compiled output.

# ---- build stage ----
FROM node:20-alpine AS build
WORKDIR /app
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build

# ---- runtime stage ----
FROM node:20-alpine
WORKDIR /app
COPY package*.json ./
# --omit=dev is the current spelling of the deprecated --only=production
RUN npm ci --omit=dev
COPY --from=build /app/dist ./dist
# NOTE(review): the container still runs as root, as in the original image.
EXPOSE 8080
CMD ["node", "dist/index.js"]
9.2 Kubernetes
# k8s/deployment.yaml
# Deployment for the onboarding service.
# NOTE(review): each replica runs the same process, including the sync scheduler
# started in src/index.ts; confirm scheduled jobs are safe to run on every replica.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: backstage-onboarding
spec:
  replicas: 3
  selector:
    matchLabels:
      app: backstage-onboarding
  template:
    metadata:
      labels:
        app: backstage-onboarding
    spec:
      containers:
        - name: onboarding
          # The CI workflow replaces this tag with the commit SHA via `kubectl set image`.
          image: backstage-onboarding:latest
          ports:
            - containerPort: 8080
          env:
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: backstage-secrets
                  key: database-url
            - name: GITHUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: github-secrets
                  key: token
            # The remaining settings from .env that the service needs at runtime.
            # Secret names/keys below are suggestions; create them (or rename) to match your cluster.
            - name: GITHUB_WEBHOOK_SECRET
              valueFrom:
                secretKeyRef:
                  name: github-secrets
                  key: webhook-secret
            - name: TFC_TOKEN
              valueFrom:
                secretKeyRef:
                  name: tfc-secrets
                  key: token
            - name: TFC_WEBHOOK_SECRET
              valueFrom:
                secretKeyRef:
                  name: tfc-secrets
                  key: webhook-secret
            - name: JWT_SECRET
              valueFrom:
                secretKeyRef:
                  name: backstage-secrets
                  key: jwt-secret
            # NOTE(review): GOOGLE_APPLICATION_CREDENTIALS points at a file, so it
            # additionally needs the key mounted as a volume (or workload identity).
          resources:
            requests:
              memory: "512Mi"
              cpu: "500m"
            limits:
              memory: "1Gi"
              cpu: "1000m"
9.3 GitHub Actions CI/CD
# .github/workflows/deploy.yml
# Builds, tests, publishes the image and rolls it out on every push to main.
# Required configuration (names are suggestions; adjust to your setup):
#   vars.REGISTRY_HOST        registry host, e.g. ghcr.io
#   vars.IMAGE_REPOSITORY     fully-qualified repository, e.g. ghcr.io/acme/backstage-onboarding
#   secrets.REGISTRY_USERNAME / secrets.REGISTRY_PASSWORD
#   secrets.KUBECONFIG_B64    base64-encoded kubeconfig for the target cluster
name: Deploy Onboarding Service
on:
  push:
    branches: [main]
env:
  # An unqualified name such as "backstage-onboarding" cannot be pushed to a
  # private registry; always build and push under the fully-qualified repository.
  IMAGE: ${{ vars.IMAGE_REPOSITORY }}
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: npm
      - name: Install dependencies
        run: npm ci
      - name: Run tests
        run: npm test
      - name: Build
        run: npm run build
      # docker push fails without an authenticated session.
      - name: Log in to registry
        uses: docker/login-action@v3
        with:
          registry: ${{ vars.REGISTRY_HOST }}
          username: ${{ secrets.REGISTRY_USERNAME }}
          password: ${{ secrets.REGISTRY_PASSWORD }}
      - name: Build Docker image
        run: docker build -t "$IMAGE:${{ github.sha }}" .
      - name: Push to registry
        run: docker push "$IMAGE:${{ github.sha }}"
      # The hosted runner has no cluster credentials by default.
      - name: Configure kubectl
        env:
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
        run: |
          mkdir -p "$HOME/.kube"
          printf '%s' "$KUBECONFIG_B64" | base64 -d > "$HOME/.kube/config"
          chmod 600 "$HOME/.kube/config"
      - name: Deploy to Kubernetes
        run: |
          kubectl set image deployment/backstage-onboarding onboarding="$IMAGE:${{ github.sha }}"
          # Fail the job if the rollout does not become healthy.
          kubectl rollout status deployment/backstage-onboarding --timeout=5m
Checklist
Pre-Launch
- Database schema created and migrated
- All core services implemented
- State machine tested
- Triggers configured (GitHub, TFC)
- Sync jobs scheduled
- API routes secured
- UI plugin installed
- Unit tests passing (> 90% coverage)
- Integration tests passing
- E2E tests passing
- Metrics endpoint working
- Logging configured
- Error alerting configured
Launch
- Deploy to staging environment
- Test with real tenants
- Monitor metrics and logs
- Verify webhooks working
- Verify sync jobs running
- Check catalog entities created correctly
- Deploy to production
- Monitor production metrics
Post-Launch
- Document runbooks
- Train support team
- Create admin dashboards
- Set up alerting rules
- Schedule regular drift detection
- Plan backfill for existing business units
Support & Troubleshooting
Common Issues
Issue: Onboarding stuck in DISCOVERING state
Solution: Check that the TFC workspace has a successful run, and verify the API credentials.
Issue: Cross-tenant violation
Solution: Verify tenant mappings in tenant_github_mappings table.
Issue: Duplicate entities
Solution: Check fingerprint logic in OnboardingFingerprint.
Debug Commands
# Check onboarding status (connects using the DATABASE_URL from .env)
psql "$DATABASE_URL" -c "SELECT id, status, current_state FROM onboarding_history ORDER BY created_at DESC LIMIT 10;"
# View sync history
psql "$DATABASE_URL" -c "SELECT workspace_id, status, created_at FROM sync_history ORDER BY created_at DESC LIMIT 10;"
# Check webhook deliveries: list the notification configurations of a TFC workspace.
# Set WORKSPACE_ID first (e.g. WORKSPACE_ID=ws-test123). A literal {workspace_id}
# in the URL would be parsed by curl as a glob pattern rather than substituted.
curl --fail --silent --show-error \
  -H "Authorization: Bearer $TFC_TOKEN" \
  -H "Content-Type: application/vnd.api+json" \
  "https://app.terraform.io/api/v2/workspaces/${WORKSPACE_ID:?set WORKSPACE_ID first}/notification-configurations"
Next Steps
- Review 01-onboarding-system-overview.md for architecture
- Follow phase-by-phase implementation
- Test thoroughly before production deployment
- Monitor metrics and logs continuously
- Iterate based on feedback
Success Metrics
- Onboarding time: < 5 minutes from Terraform apply
- Success rate: > 95%
- Sync latency: < 10 seconds for webhooks
- Drift detection: Daily reconciliation
- Catalog accuracy: 100% (no stale entities)