diff --git a/lxc_autoscale_ml/api/README.md b/lxc_autoscale_ml/api/README.md deleted file mode 100644 index e63cbe0..0000000 --- a/lxc_autoscale_ml/api/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# LXC AutoScale API - -- [Documentation](https://github.com/fabriziosalmi/proxmox-lxc-autoscale/blob/main/docs/lxc_autoscale_api/README.md) diff --git a/lxc_autoscale_ml/api/cloning_management.py b/lxc_autoscale_ml/api/cloning_management.py deleted file mode 100644 index d943091..0000000 --- a/lxc_autoscale_ml/api/cloning_management.py +++ /dev/null @@ -1,18 +0,0 @@ -from lxc_management import LXCManager -from utils import create_response, handle_error - -def create_clone(vm_id, new_vm_id, new_vm_name): - try: - lxc_manager = LXCManager() - result = lxc_manager.clone(vm_id, new_vm_id, new_vm_name) - return create_response(data=result, message=f"Container {vm_id} cloned as '{new_vm_name}' with ID {new_vm_id}") - except Exception as e: - return handle_error(e) - -def delete_clone(vm_id): - try: - lxc_manager = LXCManager() - result = lxc_manager.delete_container(vm_id) - return create_response(data=result, message=f"Container {vm_id} successfully deleted") - except Exception as e: - return handle_error(e) diff --git a/lxc_autoscale_ml/api/config.py b/lxc_autoscale_ml/api/config.py deleted file mode 100644 index 902bdb7..0000000 --- a/lxc_autoscale_ml/api/config.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import yaml -from flask import Flask - -def load_config(config_file='/etc/lxc_autoscale/lxc_autoscale_api.yaml'): - with open(config_file, 'r') as file: - config = yaml.safe_load(file) - return config - -def create_app(config=None): - app = Flask(__name__) - - if config is None: - config = load_config() - - app.config['LXC_NODE'] = config['lxc']['node'] - app.config['DEFAULT_STORAGE'] = config['lxc']['default_storage'] - app.config['TIMEOUT'] = config['lxc']['timeout_seconds'] - - # Load the rate limiting configuration - app.config['RATE_LIMITING'] = config.get('rate_limiting', {}) - - # Flask settings - app.secret_key = os.urandom(24) - app.config['DEBUG'] = False - app.config['LOGGING'] = config['logging'] - app.config['ERROR_HANDLING'] = config['error_handling'] - - return app diff --git a/lxc_autoscale_ml/api/error_handling.py b/lxc_autoscale_ml/api/error_handling.py deleted file mode 100644 index 2f04602..0000000 --- a/lxc_autoscale_ml/api/error_handling.py +++ /dev/null @@ -1,42 +0,0 @@ -from flask import jsonify, current_app -import logging - -# Centralized error handler -def handle_error(exception, status_code=500): - error_handling_config = current_app.config.get('ERROR_HANDLING', {}) - - # Log the error if logging is enabled - if error_handling_config.get('log_errors', True): - logging.error(f"Error occurred: {str(exception)}") - - # Optionally notify on critical errors - if status_code >= 500 and error_handling_config.get('notify_on_critical_errors', False): - notify_on_critical_error(exception) - - # Optionally show stack traces (in non-production environments) - if error_handling_config.get('show_stack_traces', False): - response = { - "status": "error", - "message": str(exception), - "stack_trace": repr(exception) - } - else: - response = { - "status": "error", - "message": "An internal error occurred. Please contact support if the issue persists." 
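-            # Deliberately generic message: full error details go to the log, not to API clients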
-        }
-
-    return jsonify(response), status_code
-
-# Function to notify admins of critical errors (stub function)
-def notify_on_critical_error(exception):
-    error_handling_config = current_app.config['ERROR_HANDLING']
-    recipients = error_handling_config.get('notification_recipients', [])
-
-    # Log the notification action
-    logging.info(f"Notifying recipients of critical error: {', '.join(recipients)}")
-
-    # Implement actual notification logic (e.g., send an email or post to a Slack channel)
-    # This is a placeholder for demonstration purposes
-    for recipient in recipients:
-        logging.info(f"Notified {recipient} of the error: {str(exception)}")
diff --git a/lxc_autoscale_ml/api/health_check.py b/lxc_autoscale_ml/api/health_check.py
deleted file mode 100644
index 6f41803..0000000
--- a/lxc_autoscale_ml/api/health_check.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from flask import jsonify
-
-def health_check():
-    try:
-        # Perform health checks here, e.g., checking database connectivity, etc.
-        # For now, we just return a simple healthy status.
-        status = {
-            "status": "healthy",
-            "checks": {
-                "database": "connected",  # Replace with actual database check
-                "lxc_commands": "available"  # Replace with actual check
-            }
-        }
-        return jsonify(status), 200
-    except Exception as e:
-        return jsonify({"status": "unhealthy", "error": str(e)}), 500
diff --git a/lxc_autoscale_ml/api/lxc_autoscale_api.py b/lxc_autoscale_ml/api/lxc_autoscale_api.py
deleted file mode 100644
index 82ed554..0000000
--- a/lxc_autoscale_ml/api/lxc_autoscale_api.py
+++ /dev/null
@@ -1,297 +0,0 @@
-import logging
-from flask import Flask, jsonify, request
-from config import create_app
-from scaling import scale_cpu, scale_ram, resize_storage
-from snapshot_management import create_snapshot, list_snapshots, rollback_snapshot
-from cloning_management import create_clone, delete_clone
-from resource_checking import check_vm_status, check_node_status, check_cluster_status
-from health_check import health_check
-from rate_limiting import rate_limit
-from error_handling import handle_error
-from lxc_management import LXCManager
-from utils import create_response
-
-# Setup logging to the log file
-log_file = "/var/log/lxc_autoscale_api.log"
-logging.basicConfig(
-    filename=log_file,
-    level=logging.INFO,
-    format="%(asctime)s - %(message)s",
-)
-
-app = create_app()
-
-# Home route
-
-@app.route('/', methods=['GET'])
-def home():
-    documentation = """
-    <!DOCTYPE html>
-    <html>
-    <head>
-        <title>AutoScaleAPI Documentation</title>
-    </head>
-    <body>
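-    <!-- The examples below assume the node hostname "proxmox" and the API's default port 5000. -->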
-    <h1>AutoScaleAPI Documentation</h1>
-    <p>Welcome to the AutoScaleAPI. Below is a list of available API routes with descriptions and example usage.</p>
-    <table>
-        <tr><th>Endpoint</th><th>Methods</th><th>Description</th><th>Example</th></tr>
-        <tr><td>/scale/cores</td><td>POST</td><td>Set the exact number of CPU cores for an LXC container.</td><td><code>curl -X POST http://proxmox:5000/scale/cores -H "Content-Type: application/json" -d '{"vm_id": 104, "cores": 4}'</code></td></tr>
-        <tr><td>/scale/ram</td><td>POST</td><td>Set the exact amount of RAM for an LXC container.</td><td><code>curl -X POST http://proxmox:5000/scale/ram -H "Content-Type: application/json" -d '{"vm_id": 104, "memory": 4096}'</code></td></tr>
-        <tr><td>/scale/storage/increase</td><td>POST</td><td>Increase the storage size of an LXC container's root filesystem.</td><td><code>curl -X POST http://proxmox:5000/scale/storage/increase -H "Content-Type: application/json" -d '{"vm_id": 104, "disk_size": 2}'</code></td></tr>
-        <tr><td>/snapshot/create</td><td>POST</td><td>Create a snapshot for an LXC container.</td><td><code>curl -X POST http://proxmox:5000/snapshot/create -H "Content-Type: application/json" -d '{"vm_id": 104, "snapshot_name": "my_snapshot"}'</code></td></tr>
-        <tr><td>/snapshot/list</td><td>GET</td><td>List all snapshots for an LXC container.</td><td><code>curl -X GET "http://proxmox:5000/snapshot/list?vm_id=104"</code></td></tr>
-        <tr><td>/snapshot/rollback</td><td>POST</td><td>Rollback to a specific snapshot.</td><td><code>curl -X POST http://proxmox:5000/snapshot/rollback -H "Content-Type: application/json" -d '{"vm_id": 104, "snapshot_name": "my_snapshot"}'</code></td></tr>
-        <tr><td>/clone/create</td><td>POST</td><td>Clone an LXC container.</td><td><code>curl -X POST http://proxmox:5000/clone/create -H "Content-Type: application/json" -d '{"vm_id": 104, "new_vm_id": 105, "new_vm_name": "cloned_container"}'</code></td></tr>
-        <tr><td>/clone/delete</td><td>DELETE</td><td>Delete a cloned LXC container.</td><td><code>curl -X DELETE http://proxmox:5000/clone/delete -H "Content-Type: application/json" -d '{"vm_id": 105}'</code></td></tr>
-        <tr><td>/resource/vm/status</td><td>GET</td><td>Check the resource allocation and usage for an LXC container.</td><td><code>curl -X GET "http://proxmox:5000/resource/vm/status?vm_id=104"</code></td></tr>
-        <tr><td>/resource/node/status</td><td>GET</td><td>Check the resource usage of a specific node.</td><td><code>curl -X GET "http://proxmox:5000/resource/node/status?node_name=proxmox4"</code></td></tr>
-        <tr><td>/health/check</td><td>GET</td><td>Perform a health check on the API server.</td><td><code>curl -X GET http://proxmox:5000/health/check</code></td></tr>
-        <tr><td>/routes</td><td>GET</td><td>List all available routes.</td><td><code>curl -X GET http://proxmox:5000/routes</code></td></tr>
-    </table>
- - - - - - - """.replace("", "proxmox") - - return documentation - - -# Define routes - -@app.route('/scale/cores', methods=['POST']) -@rate_limit -def set_cores(): - data = request.json - vm_id = data['vm_id'] - cores = data['cores'] - logging.info(f"Setting {cores} cores for VM {vm_id}") - return scale_cpu(vm_id, cores) - -@app.route('/scale/ram', methods=['POST']) -@rate_limit -def set_ram(): - data = request.json - vm_id = data['vm_id'] - memory = data['memory'] - logging.info(f"Setting {memory} MB RAM for VM {vm_id}") - return scale_ram(vm_id, memory) - -@app.route('/scale/storage/increase', methods=['POST']) -@rate_limit -def increase_storage(): - data = request.json - vm_id = data['vm_id'] - disk_size = data['disk_size'] - logging.info(f"Increasing storage by {disk_size} GB for VM {vm_id}") - return resize_storage(vm_id, disk_size) - -@app.route('/snapshot/create', methods=['POST']) -@rate_limit -def create_snapshot_route(): - data = request.json - vm_id = data['vm_id'] - snapshot_name = data['snapshot_name'] - logging.info(f"Creating snapshot '{snapshot_name}' for VM {vm_id}") - return create_snapshot(vm_id, snapshot_name) - -@app.route('/snapshot/list', methods=['GET']) -@rate_limit -def list_snapshots_route(): - vm_id = request.args.get('vm_id') - return list_snapshots(vm_id) - -@app.route('/snapshot/rollback', methods=['POST']) -@rate_limit -def rollback_snapshot_route(): - data = request.json - vm_id = data['vm_id'] - snapshot_name = data['snapshot_name'] - return rollback_snapshot(vm_id, snapshot_name) - -@app.route('/clone/create', methods=['POST']) -@rate_limit -def create_clone(): - data = request.json - vm_id = data['vm_id'] - new_vm_id = data['new_vm_id'] - new_vm_name = data['new_vm_name'].replace('_', '-') # Replace underscores with hyphens - snapshot_name = f"snapshot-{new_vm_id}" # Create a unique snapshot name - - try: - lxc_manager = LXCManager() - - # Step 1: Create a snapshot - lxc_manager.create_snapshot(vm_id, snapshot_name) - logging.info(f"Creating snapshot of VM {vm_id} with name '{snapshot_name}'") - - # Step 2: Clone the container from the snapshot - lxc_manager.clone_container(vm_id, new_vm_id, new_vm_name, snapshot_name) - logging.info(f"Cloning VM {vm_id} to new VM {new_vm_id} with name '{new_vm_name}'") - - # Step 3: Start the cloned container - lxc_manager.start_container(new_vm_id) - logging.info(f"Starting clone VM {new_vm_id} with name '{new_vm_name}'") - - # Step 4: Delete the snapshot - lxc_manager.delete_snapshot(vm_id, snapshot_name) - logging.info(f"Deleting snapshot of VM {vm_id} with name '{snapshot_name}'") - - return create_response( - data=f"Container {new_vm_id} cloned from {vm_id} and started successfully. 
Snapshot {snapshot_name} removed.", - message="Clone operation completed successfully.", - status_code=200 - ) - except Exception as e: - return handle_error(e) - - -@app.route('/clone/delete', methods=['DELETE']) -@rate_limit -def delete_clone_route(): - vm_id = request.json['vm_id'] - - try: - lxc_manager = LXCManager() - - # Step 1: Stop the container - lxc_manager.stop_container(vm_id) - logging.info(f"Stopping VM {new_vm_id}") - - # Step 2: Destroy the container - lxc_manager.destroy_container(vm_id) - logging.info(f"Destroying VM {new_vm_id}") - - return create_response( - data=f"Container {vm_id} stopped and destroyed successfully.", - message="Delete operation completed successfully.", - status_code=200 - ) - except Exception as e: - return handle_error(e) - - -@app.route('/resource/vm/status', methods=['GET']) -@rate_limit -def check_vm_status_route(): - vm_id = request.args.get('vm_id') - logging.info(f"Checking status for VM {vm_id}") - return check_vm_status(vm_id) - -@app.route('/resource/node/status', methods=['GET']) -@rate_limit -def check_node_status_route(): - node_name = request.args.get('node_name') - logging.info(f"Checking status for node {node_name}") - return check_node_status(node_name) - -@app.route('/health/check', methods=['GET']) -def health_check_route(): - logging.info("Health check endpoint accessed") - return health_check() - -@app.route('/routes', methods=['GET']) -def list_routes(): - routes = [] - for rule in app.url_map.iter_rules(): - methods = ','.join(rule.methods) - route = { - "endpoint": rule.endpoint, - "methods": methods, - "url": str(rule) - } - routes.append(route) - logging.info("Routes list endpoint accessed") - return jsonify(routes), 200 - - -# Setup logging -logging.basicConfig(level=logging.INFO) - -if __name__ == "__main__": - app.run(host="0.0.0.0", port=5000) diff --git a/lxc_autoscale_ml/api/lxc_autoscale_api.service b/lxc_autoscale_ml/api/lxc_autoscale_api.service deleted file mode 100644 index b949267..0000000 --- a/lxc_autoscale_ml/api/lxc_autoscale_api.service +++ /dev/null @@ -1,18 +0,0 @@ -[Unit] -Description=LXC AutoScale API -Documentation=https://github.com/fabriziosalmi/proxmox-lxc-autoscale -After=network.target - -[Service] -User=root -Group=root -Environment="PATH=/usr/local/bin:/usr/bin:/bin:/usr/sbin" -WorkingDirectory=/usr/local/bin/lxc_autoscale_api/ -ExecStart=/usr/bin/python3 /usr/local/bin/lxc_autoscale_api/lxc_autoscale_api.py - -# Restart on failure -Restart=on-failure -RestartSec=5 - -[Install] -WantedBy=multi-user.target diff --git a/lxc_autoscale_ml/api/lxc_autoscale_api.yaml b/lxc_autoscale_ml/api/lxc_autoscale_api.yaml deleted file mode 100644 index a80799b..0000000 --- a/lxc_autoscale_ml/api/lxc_autoscale_api.yaml +++ /dev/null @@ -1,87 +0,0 @@ -lxc: - # Name or IP address of the Proxmox node where LXC containers are managed. - node: "proxmox" # Replace with your Proxmox node name or IP. Note: if node name or hostname won't work use IP address instead, should work as expected. - - # Whether to verify SSL certificates. Set to false for local commands. - verify_ssl: false # Not applicable since we are using local commands - - # Default storage location for LXC containers. - default_storage: "local-lvm" # Default storage location - - # Timeout for operations in seconds. - timeout_seconds: 10 - - # Maximum number of retry attempts for operations. - max_retries: 3 - -rate_limiting: - # Maximum number of requests allowed per minute to prevent abuse. 
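-  # Enforced per client IP by rate_limiting.py; requests over the limit receive HTTP 429.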
- max_requests_per_minute: 60 - -logging: - # Logging level that controls the verbosity of logs. Options include "DEBUG", "INFO", "WARNING", "ERROR", and "CRITICAL". - level: "INFO" - - # Whether to enable log rotation to manage log file size. - rotate: true - - # Maximum size of the log file in megabytes before it is rotated. - max_log_size_mb: 100 - - # Number of backup log files to keep. Older files are deleted when new ones are created. - backup_count: 5 - - # Whether to use structured logging (e.g., JSON format). - structured_logging: false - - # Format for log messages. Uses Python's logging format string. - log_format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" # Log format - - # Path to the file where access logs are written. - access_log_file: "access.log" # Separate access log for HTTP requests - - # Path to the file where error logs are written. - error_log_file: "error.log" # Separate error log for exceptions and errors - -error_handling: - # Whether to display stack traces in error messages. - show_stack_traces: false - - # Whether to log errors to the error log file. - log_errors: true - - # Whether to send notifications for critical errors. - notify_on_critical_errors: false - - # List of email addresses to notify in case of critical errors. - notification_recipients: - # Email address to receive notifications. - - "your-email@inbox.com" - -gunicorn: - # Number of worker processes to handle incoming requests. - workers: 2 - - # Timeout for worker processes in seconds. Defines how long a worker can handle a request before being killed. - timeout_seconds: 120 - - # Log level for Gunicorn, which affects the verbosity of logs. - log_level: "info" - - # Path to the file where Gunicorn access logs are written. - access_log_file: "/var/log/lxc_autoscale_api_access.log" - - # Path to the file where Gunicorn error logs are written. - error_log_file: "/var/log/lxc_autoscale_api_error.log" - - # Whether to preload the application before forking worker processes. Reduces startup time but uses more memory. - preload_app: true - - # Time to wait before forcefully killing workers during a graceful restart. - graceful_timeout_seconds: 30 - - # Number of requests a worker will handle before being restarted. Helps to prevent memory leaks. - max_requests: 500 - - # Adds a random jitter to `max_requests` to avoid all workers restarting at the same time. 
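-  # Example: with max_requests 500 and jitter 50, each worker restarts after handling 500 to 550 requests.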
- max_requests_jitter: 50 diff --git a/lxc_autoscale_ml/api/lxc_management.py b/lxc_autoscale_ml/api/lxc_management.py deleted file mode 100644 index 0549148..0000000 --- a/lxc_autoscale_ml/api/lxc_management.py +++ /dev/null @@ -1,114 +0,0 @@ -import subprocess -import shlex -from flask import current_app -import logging - -class LXCManager: - def __init__(self, node=None): - self.node = node or current_app.config['LXC_NODE'] - self.timeout = current_app.config['TIMEOUT'] - - def _run_command(self, command): - try: - logging.info(f"Running command: {command}") - result = subprocess.run( - shlex.split(command), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - timeout=self.timeout, - universal_newlines=True - ) - result.check_returncode() - return result.stdout.strip() - except subprocess.CalledProcessError as e: - logging.error(f"Command failed: {e.stderr}") - raise Exception(f"Command failed: {e.stderr}") - except subprocess.TimeoutExpired: - logging.error("Command timed out") - raise Exception("Command timed out") - - - def stop_container(self, vm_id): - command = f"pct stop {vm_id}" - return self._run_command(command) - - def destroy_container(self, vm_id): - command = f"pct destroy {vm_id}" - return self._run_command(command) - - def create_temporary_snapshot(self, vm_id): - snapshot_name = f"migrate-snapshot-{vm_id}" - command = f"pct snapshot {vm_id} {snapshot_name}" - self._run_command(command) - return snapshot_name - - def delete_snapshot(self, vm_id, snapshot_name): - command = f"pct delsnapshot {vm_id} {snapshot_name}" - self._run_command(command) - - def migrate_container(self, vm_id, target_node): - # Check for non-migratable snapshots or create a new one for migration - snapshot_name = self.create_temporary_snapshot(vm_id) - - try: - command = f"pct migrate {vm_id} {target_node}" - self._run_command(command) - finally: - # Clean up the temporary snapshot after migration - self.delete_snapshot(vm_id, snapshot_name) - - def scale_cpu(self, vm_id, cores): - command = f"pct set {vm_id} -cores {cores}" - return self._run_command(command) - - def scale_ram(self, vm_id, memory): - command = f"pct set {vm_id} -memory {memory}" - return self._run_command(command) - - def get_current_disk_size(self, vm_id): - # Retrieve the current size of the root filesystem in GB - command = f"pct config {vm_id}" - output = self._run_command(command) - for line in output.splitlines(): - if line.startswith("rootfs:"): - current_size = line.split(",")[1].replace("size=", "").replace("G", "").strip() - return int(current_size) - raise Exception("Failed to retrieve current disk size") - - def resize_storage(self, vm_id, disk_size): - current_size = self.get_current_disk_size(vm_id) - new_size = current_size + disk_size - command = f"pct resize {vm_id} rootfs {new_size}G" - return self._run_command(command) - - def create_snapshot(self, vm_id, snapshot_name): - command = f"pct snapshot {vm_id} {snapshot_name}" - return self._run_command(command) - - def clone_container(self, vm_id, new_vm_id, new_vm_name, snapshot_name): - command = f"pct clone {vm_id} {new_vm_id} --hostname {new_vm_name} --snapname {snapshot_name}" - return self._run_command(command) - - def start_container(self, vm_id): - command = f"pct start {vm_id}" - return self._run_command(command) - - def list_snapshots(self, vm_id): - command = f"pct listsnapshot {vm_id}" - return self._run_command(command) - - def rollback_snapshot(self, vm_id, snapshot_name): - command = f"pct rollback {vm_id} {snapshot_name}" - return 
self._run_command(command)
-
-    def clone(self, vm_id, new_vm_id, new_vm_name):
-        command = f"pct clone {vm_id} {new_vm_id} --hostname {new_vm_name} --full"
-        return self._run_command(command)
-
-    def delete_container(self, vm_id):
-        # _run_command does not use a shell, so "pct stop ... && pct destroy ..." would not
-        # be interpreted; run the two commands separately instead.
-        self._run_command(f"pct stop {vm_id}")
-        return self._run_command(f"pct destroy {vm_id}")
-
-    def migrate(self, vm_id, target_node):
-        command = f"pct migrate {vm_id} {target_node}"
-        return self._run_command(command)
diff --git a/lxc_autoscale_ml/api/rate_limiting.py b/lxc_autoscale_ml/api/rate_limiting.py
deleted file mode 100644
index c8c6062..0000000
--- a/lxc_autoscale_ml/api/rate_limiting.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from functools import wraps
-from flask import request, jsonify, current_app
-import time
-
-# Simple in-memory rate limiting for demonstration purposes
-rate_limit_data = {}
-
-def rate_limit(f):
-    @wraps(f)
-    def decorated_function(*args, **kwargs):
-        client_ip = request.remote_addr
-        rate_limiting_config = current_app.config['RATE_LIMITING']
-
-        if client_ip not in rate_limit_data:
-            rate_limit_data[client_ip] = []
-
-        access_times = rate_limit_data[client_ip]
-        current_time = time.time()
-
-        # Filter out access times that are older than one minute
-        access_times = [t for t in access_times if current_time - t < 60]
-        rate_limit_data[client_ip] = access_times
-
-        if len(access_times) >= rate_limiting_config['max_requests_per_minute']:
-            return jsonify({"error": "Rate limit exceeded. Please try again later."}), 429
-
-        access_times.append(current_time)
-        return f(*args, **kwargs)
-
-    return decorated_function
diff --git a/lxc_autoscale_ml/api/resource_checking.py b/lxc_autoscale_ml/api/resource_checking.py
deleted file mode 100644
index a5650b1..0000000
--- a/lxc_autoscale_ml/api/resource_checking.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import subprocess
-from utils import create_response, handle_error
-
-def check_vm_status(vm_id):
-    try:
-        result = subprocess.run(f"pct status {vm_id}", shell=True, capture_output=True, text=True)
-        return create_response(data=result.stdout.strip(), message=f"Resource status retrieved for container {vm_id}")
-    except Exception as e:
-        return handle_error(e)
-
-def check_node_status(node_name):
-    try:
-        result = subprocess.run(f"pvesh get /nodes/{node_name}/status", shell=True, capture_output=True, text=True)
-        return create_response(data=result.stdout.strip(), message=f"Resource status retrieved for node '{node_name}'")
-    except Exception as e:
-        return handle_error(e)
-
-def check_cluster_status():
-    try:
-        result = subprocess.run("pvecm status", shell=True, capture_output=True, text=True)
-        return create_response(data=result.stdout.strip(), message="Cluster resource status retrieved successfully")
-    except Exception as e:
-        return handle_error(e)
diff --git a/lxc_autoscale_ml/api/scaling.py b/lxc_autoscale_ml/api/scaling.py
deleted file mode 100644
index b0b1c39..0000000
--- a/lxc_autoscale_ml/api/scaling.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from lxc_management import LXCManager
-from utils import create_response, handle_error
-
-def scale_cpu(vm_id, cores):
-    try:
-        lxc_manager = LXCManager()
-        result = lxc_manager.scale_cpu(vm_id, cores)
-        return create_response(data=result, message=f"CPU cores set to {cores} for container {vm_id}")
-    except Exception as e:
-        return handle_error(e)
-
-def scale_ram(vm_id, memory):
-    try:
-        lxc_manager = LXCManager()
-        result = lxc_manager.scale_ram(vm_id, memory)
-        return create_response(data=result, message=f"RAM set to {memory} MB for container {vm_id}")
-    except Exception as e:
-        return handle_error(e)
-
-def resize_storage(vm_id, disk_size):
-    try:
-        lxc_manager = LXCManager()
-        result = lxc_manager.resize_storage(vm_id, disk_size)
-        return create_response(data=result, message=f"Storage increased by {disk_size} GB for container {vm_id}")
-    except Exception as e:
-        return handle_error(e)
diff --git a/lxc_autoscale_ml/api/snapshot_management.py b/lxc_autoscale_ml/api/snapshot_management.py
deleted file mode 100644
index 0a96263..0000000
--- a/lxc_autoscale_ml/api/snapshot_management.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from lxc_management import LXCManager
-from utils import create_response, handle_error
-
-def create_snapshot(vm_id, snapshot_name):
-    try:
-        lxc_manager = LXCManager()
-        result = lxc_manager.create_snapshot(vm_id, snapshot_name)
-        return create_response(data=result, message=f"Snapshot '{snapshot_name}' created for container {vm_id}")
-    except Exception as e:
-        return handle_error(e)
-
-def list_snapshots(vm_id):
-    try:
-        lxc_manager = LXCManager()
-        result = lxc_manager.list_snapshots(vm_id)
-        return create_response(data=result, message=f"Snapshots listed for container {vm_id}")
-    except Exception as e:
-        return handle_error(e)
-
-def rollback_snapshot(vm_id, snapshot_name):
-    try:
-        lxc_manager = LXCManager()
-        result = lxc_manager.rollback_snapshot(vm_id, snapshot_name)
-        return create_response(data=result, message=f"Container {vm_id} rolled back to snapshot '{snapshot_name}'")
-    except Exception as e:
-        return handle_error(e)
diff --git a/lxc_autoscale_ml/api/utils.py b/lxc_autoscale_ml/api/utils.py
deleted file mode 100644
index 53e647e..0000000
--- a/lxc_autoscale_ml/api/utils.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from flask import jsonify, current_app
-import logging
-
-def create_response(data=None, message=None, status_code=200):
-    response = {
-        'status': 'success' if status_code < 400 else 'error',
-        'message': message,
-        'data': data
-    }
-    return jsonify(response), status_code
-
-def handle_error(exception, status_code=500):
-    error_handling_config = current_app.config.get('ERROR_HANDLING', {})
-    if error_handling_config.get('log_errors', True):
-        logging.error(f"Error occurred: {str(exception)}")
-
-    response = {
-        "status": "error",
-        "message": str(exception) if error_handling_config.get('show_stack_traces', False) else "An internal error occurred."
-    }
-    return jsonify(response), status_code
diff --git a/lxc_autoscale_ml/install_api.sh b/lxc_autoscale_ml/install_api.sh
deleted file mode 100644
index c621a0f..0000000
--- a/lxc_autoscale_ml/install_api.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-
-# Ensure the script is run as root
-if [ "$(id -u)" -ne 0 ]; then
-    echo "🚫 This script must be run as root. Please use sudo or run as root."
-    exit 1
-fi
-
-# Step 1: Create the necessary directories
-echo "📁 Creating directory /usr/local/bin/lxc_autoscale_api..."
-mkdir -p /usr/local/bin/lxc_autoscale_api
-mkdir -p /etc/lxc_autoscale
-
-# Flask testing server (a gunicorn setup will be released later)
-
-# Step 2: Install necessary packages
-echo "📦 Installing required packages..."
-apt update
-apt install git python3-flask python3-requests -y
-
-# Step 3: Clone the repository
-echo "🐙 Cloning the repository..."
-git clone https://github.com/fabriziosalmi/proxmox-lxc-autoscale
-
-# Step 4: Copy service file to systemd
-echo "📝 Copying service file to systemd directory..."
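-# The systemd unit expects the API code under /usr/local/bin/lxc_autoscale_api/ (created in Step 1).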
-cp proxmox-lxc-autoscale/lxc_autoscale_ml/api/lxc_autoscale_api.service /etc/systemd/system/lxc_autoscale_api.service
-
-# Step 5: Reload systemd daemon
-echo "🔄 Reloading systemd daemon..."
-systemctl daemon-reload
-
-# Step 6: Copy the necessary files to the appropriate directories
-echo "📂 Copying Python scripts and configuration files..."
-cp proxmox-lxc-autoscale/lxc_autoscale_ml/api/*.py /usr/local/bin/lxc_autoscale_api/
-cp proxmox-lxc-autoscale/lxc_autoscale_ml/api/lxc_autoscale_api.yaml /etc/lxc_autoscale/lxc_autoscale_api.yaml
-
-# Step 7: Enable and start the service
-echo "🚀 Enabling and starting the autoscaleapi service..."
-systemctl enable lxc_autoscale_api.service
-systemctl start lxc_autoscale_api.service
-systemctl status lxc_autoscale_api.service
-
-# Step 8: Clean up the cloned repository
-echo "🧹 Cleaning up..."
-rm -rf proxmox-lxc-autoscale
-
-echo "✅ Installation complete. The LXC AutoScale API service is now running. 🎉"
diff --git a/lxc_autoscale_ml/install_model.sh b/lxc_autoscale_ml/install_model.sh
deleted file mode 100644
index a80ea9f..0000000
--- a/lxc_autoscale_ml/install_model.sh
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/bin/bash
-
-# Variables
-
-# Model
-REPO_BASE_URL="https://raw.githubusercontent.com/fabriziosalmi/proxmox-lxc-autoscale/main"
-SCRIPT_URL="${REPO_BASE_URL}/lxc_autoscale_ml/model/lxc_autoscale_ml.py"
-SERVICE_URL="${REPO_BASE_URL}/lxc_autoscale_ml/model/lxc_autoscale_ml.service"
-CONF_URL="${REPO_BASE_URL}/lxc_autoscale_ml/model/lxc_autoscale_ml.yaml"
-
-INSTALL_PATH="/usr/local/bin/lxc_autoscale_ml.py"
-SERVICE_PATH="/etc/systemd/system/lxc_autoscale_ml.service"
-CONF_DIR="/etc/lxc_autoscale"
-YAML_CONF_PATH="${CONF_DIR}/lxc_autoscale_ml.yaml"
-
-# Function to check and stop the service if running
-stop_service_if_running() {
-    if systemctl is-active --quiet lxc_autoscale_ml.service; then
-        echo "🛑 Stopping LXC AutoScale ML service..."
-        systemctl stop lxc_autoscale_ml.service
-        if [ $? -ne 0 ]; then
-            echo "❌ Error: Failed to stop the service."
-            exit 1
-        fi
-    fi
-}
-
-# Function to start the service
-start_service() {
-    echo "🚀 Starting the LXC AutoScale ML service..."
-    systemctl start lxc_autoscale_ml.service
-    if [ $? -ne 0 ]; then
-        echo "❌ Error: Failed to start the service."
-        exit 1
-    fi
-}
-
-# Function to enable the service
-enable_service() {
-    echo "🔧 Enabling the LXC AutoScale ML service..."
-    systemctl enable lxc_autoscale_ml.service
-    if [ $? -ne 0 ]; then
-        echo "❌ Error: Failed to enable the service."
-        exit 1
-    fi
-}
-
-# Function to backup existing configuration file
-backup_existing_conf() {
-    if [ -f "$YAML_CONF_PATH" ]; then
-        timestamp=$(date +"%Y%m%d-%H%M%S")
-        backup_conf="${YAML_CONF_PATH}.${timestamp}.backup"
-        echo "💾 Backing up existing configuration file to $backup_conf..."
-        cp "$YAML_CONF_PATH" "$backup_conf"
-        if [ $? -ne 0 ]; then
-            echo "❌ Error: Failed to backup the existing configuration file."
-            exit 1
-        fi
-    fi
-}
-
-# Stop the service if it's already running
-stop_service_if_running
-
-# Download the main Python script
-echo "📥 Downloading the LXC AutoScale ML main script..."
-curl -sSL -o $INSTALL_PATH $SCRIPT_URL
-if [ $? -ne 0 ]; then
-    echo "❌ Error: Failed to download the main script."
-    exit 1
-fi
-
-# Make the main script executable
-chmod +x $INSTALL_PATH
-
-# Download the systemd service file
-echo "📥 Downloading the systemd service file..."
-curl -sSL -o $SERVICE_PATH $SERVICE_URL
-if [ $? 
-ne 0 ]; then - echo "โŒ Error: Failed to download the service file." - exit 1 -fi - -# Set up the configuration directory and file, with backup if needed -echo "๐Ÿ“‚ Setting up configuration directory and file..." -mkdir -p $CONF_DIR - backup_existing_conf - curl -sSL -o $YAML_CONF_PATH $CONF_URL - if [ $? -ne 0 ]; then - echo "โŒ Error: Failed to download the configuration file." - exit 1 - fi - -# Reload systemd to recognize the new service -echo "๐Ÿ”„ Reloading systemd daemon..." -systemctl daemon-reload - -# Enable and start the LXC AutoScale service -enable_service -start_service - -# Check the status of the service -echo "๐Ÿ” Checking service status..." -systemctl status lxc_autoscale_ml.service --no-pager - -# Verify that the service is running -if systemctl is-active --quiet lxc_autoscale_ml.service; then - echo "โœ… LXC AutoScale ML service is running successfully." -else - echo "โŒ Error: LXC AutoScale ML service failed to start." - exit 1 -fi - -echo "๐ŸŽ‰ Installation and setup completed successfully." diff --git a/lxc_autoscale_ml/install_monitor.sh b/lxc_autoscale_ml/install_monitor.sh deleted file mode 100644 index 2372b64..0000000 --- a/lxc_autoscale_ml/install_monitor.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash - -# Variables - -# Monitor -REPO_BASE_URL="https://raw.githubusercontent.com/fabriziosalmi/proxmox-lxc-autoscale/main" -SCRIPT_URL="${REPO_BASE_URL}/lxc_autoscale_ml/monitor/lxc_monitor.py" -SERVICE_URL="${REPO_BASE_URL}/lxc_autoscale_ml/monitor/lxc_monitor.service" -CONF_URL="${REPO_BASE_URL}/lxc_autoscale_ml/monitor/lxc_monitor.yaml" - -INSTALL_PATH="/usr/local/bin/lxc_monitor.py" -SERVICE_PATH="/etc/systemd/system/lxc_monitor.service" -CONF_DIR="/etc/lxc_autoscale" -YAML_CONF_PATH="${CONF_DIR}/lxc_monitor.yaml" -LOG_PATH="/var/log/lxc_monitor.log" - -# Function to check and stop the service if running -stop_service_if_running() { - if systemctl is-active --quiet lxc_monitor.service; then - echo "๐Ÿ›‘ Stopping LXC AutoScale Monitor service..." - systemctl stop lxc_monitor.service - if [ $? -ne 0 ]; then - echo "โŒ Error: Failed to stop the service." - exit 1 - fi - fi -} - -# Function to start the service -start_service() { - echo "๐Ÿš€ Starting the LXC AutoScale Monitor service..." - systemctl start lxc_monitor.service - if [ $? -ne 0 ]; then - echo "โŒ Error: Failed to start the service." - exit 1 - fi -} - -# Function to enable the service -enable_service() { - echo "๐Ÿ”ง Enabling the LXC AutoScale Monitor service..." - systemctl enable lxc_monitor.service - if [ $? -ne 0 ]; then - echo "โŒ Error: Failed to enable the service." - exit 1 - fi -} - -# Function to backup existing configuration file -backup_existing_conf() { - if [ -f "$YAML_CONF_PATH" ]; then - timestamp=$(date +"%Y%m%d-%H%M%S") - backup_conf="${YAML_CONF_PATH}.${timestamp}.backup" - echo "๐Ÿ’พ Backing up existing configuration file to $backup_conf..." - cp "$YAML_CONF_PATH" "$backup_conf" - if [ $? -ne 0 ]; then - echo "โŒ Error: Failed to backup the existing configuration file." - exit 1 - fi - fi -} - -# Stop the service if it's already running -stop_service_if_running - -# Download the main Python script -echo "๐Ÿ“ฅ Downloading the LXC AutoScale Monitor main script..." -curl -sSL -o $INSTALL_PATH $SCRIPT_URL -if [ $? -ne 0 ]; then - echo "โŒ Error: Failed to download the main script." - exit 1 -fi - -# Make the main script executable -chmod +x $INSTALL_PATH - -# Download the systemd service file -echo "๐Ÿ“ฅ Downloading the systemd service file..." 
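-# Overwrites any existing unit file at $SERVICE_PATH; systemd is reloaded before the service restarts below.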
-curl -sSL -o $SERVICE_PATH $SERVICE_URL -if [ $? -ne 0 ]; then - echo "โŒ Error: Failed to download the service file." - exit 1 -fi - -# Set up the configuration directory and file, with backup if needed -echo "๐Ÿ“‚ Setting up configuration directory and file..." -mkdir -p $CONF_DIR - backup_existing_conf - curl -sSL -o $YAML_CONF_PATH $CONF_URL - if [ $? -ne 0 ]; then - echo "โŒ Error: Failed to download the configuration file." - exit 1 - fi - -# Create the log file if it doesn't exist -touch $LOG_PATH - -# Set the correct permissions -echo "๐Ÿ”ง Setting permissions..." -chown root:root $LOG_PATH -chmod 644 $LOG_PATH - -# Reload systemd to recognize the new service -echo "๐Ÿ”„ Reloading systemd daemon..." -systemctl daemon-reload - -# Enable and start the LXC AutoScale service -enable_service -start_service - -# Check the status of the service -echo "๐Ÿ” Checking service status..." -systemctl status lxc_monitor.service --no-pager - -# Verify that the service is running -if systemctl is-active --quiet lxc_monitor.service; then - echo "โœ… LXC Monitor service is running successfully." -else - echo "โŒ Error: LXC Monitor service failed to start." - exit 1 -fi - -echo "๐ŸŽ‰ Installation and setup completed successfully." diff --git a/lxc_autoscale_ml/model/README.md b/lxc_autoscale_ml/model/README.md deleted file mode 100644 index c947534..0000000 --- a/lxc_autoscale_ml/model/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# LXC AutoScale ML (the model) - -- [Documentation](https://github.com/fabriziosalmi/proxmox-lxc-autoscale/blob/main/docs/lxc_model/README.md) diff --git a/lxc_autoscale_ml/model/config_manager.py b/lxc_autoscale_ml/model/config_manager.py deleted file mode 100644 index e97af79..0000000 --- a/lxc_autoscale_ml/model/config_manager.py +++ /dev/null @@ -1,39 +0,0 @@ -import yaml -import logging -import os - -class ConfigError(Exception): - pass - -def load_config(config_path, default_config=None): - if not os.path.exists(config_path): - logging.error(f"Configuration file not found: {config_path}") - raise ConfigError(f"Configuration file not found: {config_path}") - - try: - with open(config_path, 'r') as file: - config = yaml.safe_load(file) - logging.info(f"Configuration loaded from {config_path}") - except yaml.YAMLError as e: - logging.error(f"Error parsing YAML file: {e}") - raise ConfigError(f"Error parsing YAML file: {e}") - except Exception as e: - logging.error(f"Unexpected error loading configuration: {e}") - raise ConfigError(f"Unexpected error loading configuration: {e}") - - if default_config: - config = {**default_config, **config} - logging.debug(f"Configuration merged with default values.") - - required_keys = ['log_file', 'interval_seconds', 'api'] - for key in required_keys: - if key not in config: - logging.error(f"Missing required configuration key: {key}") - raise ConfigError(f"Missing required configuration key: {key}") - - if 'api_url' not in config['api']: - logging.error("Missing required configuration key: api_url") - raise ConfigError("Missing required configuration key: api_url") - - logging.debug(f"Final configuration: {config}") - return config diff --git a/lxc_autoscale_ml/model/data_manager.py b/lxc_autoscale_ml/model/data_manager.py deleted file mode 100644 index 6e61b9a..0000000 --- a/lxc_autoscale_ml/model/data_manager.py +++ /dev/null @@ -1,135 +0,0 @@ -import pandas as pd -import logging -import json -import numpy as np -from sklearn.preprocessing import StandardScaler - -def load_data(file_path): - try: - with open(file_path, 'r') as f: - 
data = json.load(f) - logging.info(f"Data loaded successfully from {file_path}.") - except FileNotFoundError: - logging.error(f"File not found: {file_path}") - return None - except json.JSONDecodeError as e: - logging.error(f"JSON decoding failed: {e}") - return None - except Exception as e: - logging.error(f"Unexpected error loading data: {e}") - return None - - if not data: - logging.error(f"No data found in {file_path}.") - return None - - records = [] - for snapshot in data: - for container_id, metrics in snapshot.items(): - if container_id == "summary": - continue # Skip the summary, focus on container data - - try: - record = { - "container_id": container_id, - "timestamp": metrics["timestamp"], - "cpu_usage_percent": metrics["cpu_usage_percent"], - "memory_usage_mb": metrics["memory_usage_mb"], - "swap_usage_mb": metrics.get("swap_usage_mb", 0), - "swap_total_mb": metrics.get("swap_total_mb", 0), - "process_count": metrics["process_count"], - "io_reads": metrics["io_stats"]["reads"], - "io_writes": metrics["io_stats"]["writes"], - "network_rx_bytes": metrics["network_usage"]["rx_bytes"], - "network_tx_bytes": metrics["network_usage"]["tx_bytes"], - "filesystem_usage_gb": metrics["filesystem_usage_gb"], - "filesystem_total_gb": metrics["filesystem_total_gb"], - "filesystem_free_gb": metrics["filesystem_free_gb"], - } - records.append(record) - except KeyError as e: - logging.warning(f"Missing expected metric {e} in container {container_id}. Skipping.") - continue - - if not records: - logging.error("No valid records found in the data.") - return None - - df = pd.DataFrame(records) - try: - df['timestamp'] = pd.to_datetime(df['timestamp']) - except Exception as e: - logging.error(f"Error converting timestamps: {e}") - return None - - logging.info("Data preprocessed successfully.") - return df - - -def preprocess_data(df, config): - if df.empty: - logging.error("DataFrame is empty. 
Skipping preprocessing.") - return df - - try: - spike_threshold = config.get('spike_detection', {}).get('spike_threshold', 2) - rolling_window_size = config.get('rolling_window', 5) - - # Derived metrics - df['cpu_per_process'] = df['cpu_usage_percent'] / df['process_count'] - df['memory_per_process'] = df['memory_usage_mb'] / df['process_count'] - df['cpu_memory_ratio'] = df['cpu_usage_percent'] / df['memory_usage_mb'] # New metric: CPU to Memory Ratio - - # Fix for time_diff calculation - try: - df['time_diff'] = df.groupby('container_id')['timestamp'].diff().dt.total_seconds() - df['time_diff'].fillna(0, inplace=True) - except Exception as e: - logging.error(f"Error calculating 'time_diff': {e}") - df['time_diff'] = 0 # Default to 0 in case of errors - - # Rolling statistics for spike detection and trend analysis - df['rolling_mean_cpu'] = df.groupby('container_id')['cpu_usage_percent'].transform( - lambda x: x.rolling(window=rolling_window_size, min_periods=1).mean()) - df['rolling_std_cpu'] = df.groupby('container_id')['cpu_usage_percent'].transform( - lambda x: x.rolling(window=rolling_window_size, min_periods=1).std()).fillna(0) - df['rolling_mean_memory'] = df.groupby('container_id')['memory_usage_mb'].transform( - lambda x: x.rolling(window=rolling_window_size, min_periods=1).mean()) - df['rolling_std_memory'] = df.groupby('container_id')['memory_usage_mb'].transform( - lambda x: x.rolling(window=rolling_window_size, min_periods=1).std()).fillna(0) - - # Spike detection - df['cpu_spike'] = np.abs(df['cpu_usage_percent'] - df['rolling_mean_cpu']) > (spike_threshold * df['rolling_std_cpu']) - df['memory_spike'] = np.abs(df['memory_usage_mb'] - df['rolling_mean_memory']) > (spike_threshold * df['rolling_std_memory']) - - # Trend detection using the slope of the rolling window - df['cpu_trend'] = df.groupby('container_id')['cpu_usage_percent'].transform( - lambda x: np.polyfit(np.arange(len(x)), x.rolling(window=rolling_window_size, min_periods=1).mean(), 1)[0]) - df['memory_trend'] = df.groupby('container_id')['memory_usage_mb'].transform( - lambda x: np.polyfit(np.arange(len(x)), x.rolling(window=rolling_window_size, min_periods=1).mean(), 1)[0]) - - # Aggregated features - df['max_cpu'] = df.groupby('container_id')['cpu_usage_percent'].transform( - lambda x: x.rolling(window=rolling_window_size, min_periods=1).max()) - df['min_cpu'] = df.groupby('container_id')['cpu_usage_percent'].transform( - lambda x: x.rolling(window=rolling_window_size, min_periods=1).min()) - df['max_memory'] = df.groupby('container_id')['memory_usage_mb'].transform( - lambda x: x.rolling(window=rolling_window_size, min_periods=1).max()) - df['min_memory'] = df.groupby('container_id')['memory_usage_mb'].transform( - lambda x: x.rolling(window=rolling_window_size, min_periods=1).min()) - - # Feature scaling - scaler = StandardScaler() - features_to_scale = [ - 'cpu_usage_percent', 'memory_usage_mb', 'cpu_per_process', 'memory_per_process', 'time_diff', - 'cpu_trend', 'memory_trend', 'max_cpu', 'min_cpu', 'max_memory', 'min_memory', - 'cpu_memory_ratio' - ] - df[features_to_scale] = scaler.fit_transform(df[features_to_scale]) - - logging.info("Feature engineering, spike detection, and trend detection completed.") - except Exception as e: - logging.error(f"Error during data preprocessing: {e}") - return df - - return df diff --git a/lxc_autoscale_ml/model/lock_manager.py b/lxc_autoscale_ml/model/lock_manager.py deleted file mode 100644 index 1e29b40..0000000 --- a/lxc_autoscale_ml/model/lock_manager.py +++ 
/dev/null
@@ -1,19 +0,0 @@
-import os
-import sys
-import logging
-
-def create_lock_file(lock_file):
-    if os.path.exists(lock_file):
-        logging.error("Another instance of the script is already running. Exiting.")
-        sys.exit(1)
-    else:
-        with open(lock_file, 'w') as lf:
-            lf.write(str(os.getpid()))
-        logging.info(f"Lock file created at {lock_file}.")
-
-def remove_lock_file(lock_file):
-    if os.path.exists(lock_file):
-        os.remove(lock_file)
-        logging.info(f"Lock file {lock_file} removed.")
-    else:
-        logging.warning(f"Lock file {lock_file} was not found.")
diff --git a/lxc_autoscale_ml/model/logger.py b/lxc_autoscale_ml/model/logger.py
deleted file mode 100644
index e1772b7..0000000
--- a/lxc_autoscale_ml/model/logger.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import logging
-
-def setup_logging(log_file, log_level="INFO"):
-    """
-    Set up logging to log to both a file and the console.
-
-    :param log_file: Path to the log file.
-    :param log_level: The logging level (default is "INFO").
-    """
-    # Convert log level string to logging level
-    log_level = getattr(logging, log_level.upper(), logging.INFO)
-
-    # Create a logger
-    logger = logging.getLogger()
-    logger.setLevel(log_level)
-
-    # Create file handler
-    file_handler = logging.FileHandler(log_file)
-    file_handler.setLevel(log_level)
-
-    # Create console handler
-    console_handler = logging.StreamHandler()
-    console_handler.setLevel(log_level)
-
-    # Define log format
-    formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
-    file_handler.setFormatter(formatter)
-    console_handler.setFormatter(formatter)
-
-    # Add handlers to the logger
-    logger.addHandler(file_handler)
-    logger.addHandler(console_handler)
diff --git a/lxc_autoscale_ml/model/lxc_autoscale_ml.py b/lxc_autoscale_ml/model/lxc_autoscale_ml.py
deleted file mode 100644
index 1745eeb..0000000
--- a/lxc_autoscale_ml/model/lxc_autoscale_ml.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import time
-import logging
-import pandas as pd
-
-# Ensure all modules in the lxc_autoscale_ml directory are accessible
-sys.path.append('/usr/local/bin/lxc_autoscale_ml')
-
-# Import custom modules
-from logger import setup_logging
-from lock_manager import create_lock_file, remove_lock_file
-from config_manager import load_config
-from data_manager import load_data, preprocess_data
-from model import train_anomaly_models, predict_anomalies
-from scaling import determine_scaling_action, apply_scaling
-from signal_handler import setup_signal_handlers
-
-
-def main():
-    # install_model.sh places the configuration under /etc/lxc_autoscale/
-    config = load_config("/etc/lxc_autoscale/lxc_autoscale_ml.yaml")
-    setup_logging(config.get("log_file", "/var/log/lxc_autoscale_ml.log"))
-
-    logging.info("Starting the LXC auto-scaling script...")
-
-    create_lock_file(config.get("lock_file", "/tmp/lxc_autoscale_ml.lock"))
-
-    try:
-        while True:
-            # Load and preprocess data
-            df = load_data(config.get("data_file", "/var/log/lxc_metrics.json"))
-            if df is None:
-                logging.error("Exiting due to data loading error.")
-                return
-
-            df = preprocess_data(df, config)
-
-            # Train anomaly detection models
-            model, features_to_use = train_anomaly_models(df, config)
-            if model is None:
-                logging.error("Model training failed. 
Exiting.") - return - - logging.info("Processing containers for scaling decisions...") - - # Iterate over each container and make scaling decisions - for container_id in df["container_id"].unique(): - container_data = df[df["container_id"] == container_id] - latest_metrics = container_data.iloc[-1] - - logging.debug(f"Latest metrics for container {container_id}: {latest_metrics.to_dict()}") - - scaling_decision, confidence = predict_anomalies(model, latest_metrics, features_to_use, config) - - if scaling_decision is not None: - cpu_action, ram_action, new_cores, new_ram = determine_scaling_action(latest_metrics, scaling_decision, confidence, config) - logging.debug(f"Scaling decision for container {container_id}: CPU - {cpu_action}, RAM - {ram_action} | Confidence: {confidence:.2f}%") - - if cpu_action != "No Scaling" or ram_action != "No Scaling": - logging.info(f"Applying scaling actions for container {container_id}: CPU - {cpu_action}, RAM - {ram_action} | Confidence: {confidence:.2f}%") - apply_scaling(container_id, new_cores, new_ram, config) - else: - logging.info(f"No scaling needed for container {container_id}. | Confidence: {confidence:.2f}%") - else: - logging.warning(f"Skipping scaling for container {container_id} due to lack of prediction.") - - # Sleep until the next interval - logging.info(f"Sleeping for {config.get('interval_seconds', 60)} seconds before the next run.") - time.sleep(config.get("interval_seconds", 60)) - - except Exception as e: - logging.error(f"An error occurred: {e}") - finally: - remove_lock_file(config.get("lock_file", "/tmp/lxc_autoscale_ml.lock")) - logging.info("Script execution completed.") - -if __name__ == "__main__": - # Setup signal handlers to ensure graceful shutdown - setup_signal_handlers() - main() diff --git a/lxc_autoscale_ml/model/lxc_autoscale_ml.service b/lxc_autoscale_ml/model/lxc_autoscale_ml.service deleted file mode 100644 index 982ff40..0000000 --- a/lxc_autoscale_ml/model/lxc_autoscale_ml.service +++ /dev/null @@ -1,19 +0,0 @@ -[Unit] -Description=LXC AutoScale ML Service -Documentation=https://github.com/fabriziosalmi/proxmox-lxc-autoscale -After=network.target - -[Service] -ExecStart=/usr/bin/python3 /usr/local/bin/lxc_autoscale_ml/lxc_autoscale_ml.py -WorkingDirectory=/usr/local/bin/lxc_autoscale_ml -StandardOutput=inherit -StandardError=inherit -Restart=on-failure -User=root - -# Logging configuration -Environment="PYTHONUNBUFFERED=1" -EnvironmentFile=/etc/lxc_autoscale_ml/lxc_autoscale_ml.yaml - -[Install] -WantedBy=multi-user.target diff --git a/lxc_autoscale_ml/model/lxc_autoscale_ml.yaml b/lxc_autoscale_ml/model/lxc_autoscale_ml.yaml deleted file mode 100644 index 77a2997..0000000 --- a/lxc_autoscale_ml/model/lxc_autoscale_ml.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# ============================================================= -# LXC AutoScale ML -# Automated Container Scaling Configuration -# ------------------------------------------------------------- -# This configuration file controls the behavior of the LXC -# AutoScale ML script. It includes settings for logging, -# model training, spike detection, scaling actions, API -# interactions, and more. -# -# The script is designed to automatically scale LXC containers -# based on real-time metrics, such as CPU usage, memory usage, -# and other system indicators. The scaling decisions are made -# using machine learning models, which can detect anomalies -# and trends in the data. 
-# -# Author: Fabrizio Salmi - fabrizio.salmi@gmail.com -# Date: August 20, 2024 -# ============================================================= - -# Logging Configuration -log_file: "/var/log/lxc_autoscale_ml.log" # Path to the log file -log_level: "DEBUG" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) - -# Lock File Configuration -lock_file: "/tmp/lxc_autoscale_ml.lock" # Path to the lock file to prevent multiple instances - -# Data File Configuration -data_file: "/var/log/lxc_metrics.json" # Path to the metrics file containing container data produced by LXC AutoScale API - -# Model Configuration -model: - contamination: 0.05 # Contamination level for IsolationForest (fraction of outliers) - n_estimators: 100 # Number of trees in IsolationForest - max_samples: 64 # Number of samples to draw for training each tree - random_state: 42 # Random seed for reproducibility - -# Spike Detection Configuration -spike_detection: - spike_threshold: 2 # Number of standard deviations for spike detection - rolling_window: 5 # Window size for rolling mean and standard deviation - -# Scaling Configuration -scaling: - total_cores: 4 # Total number of CPU cores available on the server - total_ram_mb: 16384 # Total RAM available on the server in MB - target_cpu_load_percent: 50 # Target CPU load percentage after scaling - max_cpu_cores: 4 # Maximum number of CPU cores to maintain per container - max_ram_mb: 8192 # Maximum RAM to maintain per container in MB - min_cpu_cores: 2 # Minimum number of CPU cores to maintain per container - min_ram_mb: 1024 # Minimum RAM to maintain per container in MB - cpu_scale_up_threshold: 75 # CPU usage percentage to trigger scale-up - cpu_scale_down_threshold: 30 # CPU usage percentage to trigger scale-down - ram_scale_up_threshold: 75 # RAM usage percentage to trigger scale-up - ram_scale_down_threshold: 30 # RAM usage percentage to trigger scale-down - ram_chunk_size: 50 # Minimum RAM scaling chunk size in MB - ram_upper_limit: 1024 # Maximum RAM scaling limit in one step in MB - dry_run: false # If true, perform a dry run without making actual API calls - -# API Configuration -api: - api_url: "http://127.0.0.1:5000" # Base URL for the API used for scaling actions - cores_endpoint: "/scale/cores" # Endpoint for scaling CPU cores - ram_endpoint: "/scale/ram" # Endpoint for scaling RAM - -# Retry Logic for API Calls -retry_logic: - max_retries: 3 # Maximum number of retries for API calls - retry_delay: 2 # Delay between retries in seconds - -# Interval Configuration -interval_seconds: 60 # Time interval between consecutive script runs in seconds - -# Feature Engineering Configuration -feature_engineering: - include_io_activity: true # Include IO activity as a feature in the model - include_network_activity: true # Include network activity as a feature in the model - -# Prediction Configuration -prediction: - use_latest_only: true # If true, use only the latest data point for prediction - include_rolling_features: true # Include rolling mean and std features for prediction - -# Ignored Containers -ignore_lxc: - - "101" # List of container IDs to ignore from scaling - - "102" diff --git a/lxc_autoscale_ml/model/model.py b/lxc_autoscale_ml/model/model.py deleted file mode 100644 index 36caeca..0000000 --- a/lxc_autoscale_ml/model/model.py +++ /dev/null @@ -1,53 +0,0 @@ -from sklearn.ensemble import IsolationForest -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler -import logging -import numpy as np -import pandas as pd - -def 
train_anomaly_models(df, config): - # Select only numeric features for training, excluding non-relevant columns - features_to_use = df.select_dtypes(include=[np.number]).columns.difference(['container_id', 'timestamp']) - X_train = df[features_to_use] - - logging.info(f"Features used for training: {list(features_to_use)}") - - model = IsolationForest( - contamination=config.get('model', {}).get('contamination', 0.05), - n_estimators=config.get('model', {}).get('n_estimators', 100), - max_samples=config.get('model', {}).get('max_samples', 64), - random_state=config.get('model', {}).get('random_state', 42) - ) - - try: - pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('IsolationForest', model) - ]) - pipeline.fit(X_train) - logging.info("IsolationForest model training completed.") - return pipeline, features_to_use # Return both the model and the feature set used - except Exception as e: - logging.error(f"Error during model training: {e}") - return None, None - - -def predict_anomalies(model, latest_metrics, features_to_use, config): - # Ensure only the relevant features are used for prediction - latest_metrics_df = latest_metrics[features_to_use].to_frame().T - - logging.debug(f"Features used for prediction: {latest_metrics_df.columns.tolist()}") - - try: - anomaly_score = model.decision_function(latest_metrics_df) - # Convert anomaly_score to a scalar if it's an array - anomaly_score = anomaly_score.item() if isinstance(anomaly_score, np.ndarray) else anomaly_score - # Normalize the anomaly score to a confidence level (0 to 100%) - confidence = (1 - anomaly_score) * 100 # Inverse because lower scores mean more abnormal - logging.debug(f"Anomaly score: {anomaly_score}, Confidence: {confidence:.2f}%") - prediction = model.predict(latest_metrics_df) - prediction = prediction.item() if isinstance(prediction, np.ndarray) else prediction - return prediction, confidence - except Exception as e: - logging.error(f"Error during prediction with IsolationForest: {e}") - return None, 0 diff --git a/lxc_autoscale_ml/model/scaling.py b/lxc_autoscale_ml/model/scaling.py deleted file mode 100644 index 1a86ed0..0000000 --- a/lxc_autoscale_ml/model/scaling.py +++ /dev/null @@ -1,107 +0,0 @@ -import requests -import logging -import time -import sys -import pandas as pd - -# Ensure all modules in the lxc_autoscale_ml directory are accessible -sys.path.append('/usr/local/bin/lxc_autoscale_ml') - -# Import custom modules -from logger import setup_logging -from lock_manager import create_lock_file, remove_lock_file -from config_manager import load_config -from model import train_anomaly_models, predict_anomalies -from signal_handler import setup_signal_handlers - -def determine_scaling_action(latest_metrics, scaling_decision, confidence, config): - cpu_action = "No Scaling" - ram_action = "No Scaling" - new_cores = None - new_ram = None - - cpu_usage = latest_metrics["cpu_usage_percent"] - memory_usage = latest_metrics["memory_usage_mb"] - cpu_memory_ratio = latest_metrics.get("cpu_memory_ratio", None) - io_ops_per_second = latest_metrics.get("io_ops_per_second", None) - - logging.debug(f"CPU usage: {cpu_usage}% | Memory usage: {memory_usage}MB | Confidence: {confidence}%") - - cpu_thresholds = config["scaling"] - ram_thresholds = config["scaling"] - - if scaling_decision: - logging.debug("Anomaly detected. 
Scaling up CPU and RAM.")
-        cpu_action = "Scale Up"
-        ram_action = "Scale Up"
-    else:
-        if cpu_usage > cpu_thresholds["cpu_scale_up_threshold"]:
-            cpu_action = "Scale Up"
-            logging.debug(f"CPU usage {cpu_usage}% exceeds the scale-up threshold.")
-        elif cpu_usage < cpu_thresholds["cpu_scale_down_threshold"]:
-            cpu_action = "Scale Down"
-            logging.debug(f"CPU usage {cpu_usage}% is below the scale-down threshold.")
-
-        if memory_usage > ram_thresholds["ram_scale_up_threshold"]:
-            ram_action = "Scale Up"
-            logging.debug(f"Memory usage {memory_usage}MB exceeds the scale-up threshold.")
-        elif memory_usage < ram_thresholds["ram_scale_down_threshold"]:
-            ram_action = "Scale Down"
-            logging.debug(f"Memory usage {memory_usage}MB is below the scale-down threshold.")
-
-    # Ensure scaling stays within limits
-    if cpu_action == "Scale Up":
-        new_cores = min(cpu_thresholds["total_cores"], cpu_thresholds["max_cpu_cores"])
-    elif cpu_action == "Scale Down":
-        new_cores = cpu_thresholds["min_cpu_cores"]
-
-    if ram_action == "Scale Up":
-        new_ram = min(ram_thresholds["total_ram_mb"], ram_thresholds["max_ram_mb"])
-    elif ram_action == "Scale Down":
-        new_ram = ram_thresholds["min_ram_mb"]
-
-    logging.debug(f"Final scaling actions: CPU -> {cpu_action}, RAM -> {ram_action} | Confidence: {confidence}%")
-    return cpu_action, ram_action, new_cores, new_ram
-
-
-def apply_scaling(lxc_id, new_cores, new_ram, config):
-    max_retries = config.get("retry_logic", {}).get("max_retries", 3)
-    retry_delay = config.get("retry_logic", {}).get("retry_delay", 2)
-    base_url = config["api"]["api_url"]
-    cores_endpoint = config["api"].get("cores_endpoint", "/scale/cores")
-    ram_endpoint = config["api"].get("ram_endpoint", "/scale/ram")
-
-    def perform_request(url, data, resource_type):
-        resource_key = "cores" if resource_type == "CPU" else "memory"
-        for attempt in range(max_retries):
-            try:
-                response = requests.post(url, json=data)
-                response.raise_for_status()
-                logging.info(f"Successfully scaled {resource_type} for LXC ID {lxc_id} to {data[resource_key]} {resource_type} units.")
-                return True
-            except requests.RequestException as e:
-                # e.response is None when the request never reached the server (e.g. a connection error)
-                if e.response is not None and e.response.status_code == 500:
-                    logging.error(f"Server error (500) encountered on attempt {attempt + 1} to scale {resource_type} for LXC ID {lxc_id}. 
-                    break  # Skip further retries for 500 errors
-                logging.error(f"Attempt {attempt + 1} failed to scale {resource_type} for LXC ID {lxc_id}: {e}")
-                if attempt < max_retries - 1:
-                    logging.info(f"Retrying in {retry_delay} seconds...")
-                    time.sleep(retry_delay)
-                else:
-                    logging.error(f"Scaling {resource_type} for LXC ID {lxc_id} failed after {max_retries} attempts.")
-                    return False
-        return False
-
-    if new_cores is not None:
-        cpu_data = {"vm_id": lxc_id, "cores": new_cores}
-        cpu_url = f"{base_url}{cores_endpoint}"
-        if not perform_request(cpu_url, cpu_data, "CPU"):
-            logging.error(f"Scaling operation aborted for LXC ID {lxc_id} due to CPU scaling failure.")
-
-    if new_ram is not None:
-        ram_data = {"vm_id": lxc_id, "memory": new_ram}
-        ram_url = f"{base_url}{ram_endpoint}"
-        if not perform_request(ram_url, ram_data, "RAM"):
-            logging.error(f"Scaling operation aborted for LXC ID {lxc_id} due to RAM scaling failure.")
diff --git a/lxc_autoscale_ml/model/signal_handler.py b/lxc_autoscale_ml/model/signal_handler.py
deleted file mode 100644
index 95cf7bf..0000000
--- a/lxc_autoscale_ml/model/signal_handler.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import signal
-import logging
-import sys
-
-def setup_signal_handlers(cleanup_function=None):
-    """
-    Sets up signal handlers to handle termination signals (SIGINT and SIGTERM) gracefully.
-
-    :param cleanup_function: A function to be called for cleanup before exiting (optional).
-    """
-    def signal_handler(signum, frame):
-        """
-        Handles received signals, logs them, and exits the program gracefully.
-
-        :param signum: The signal number received.
-        :param frame: The current stack frame (not used).
-        """
-        signal_name = signal.Signals(signum).name  # Get a more descriptive signal name
-        logging.info(f"Received {signal_name} ({signum}). 
Exiting gracefully.") - - # Call the cleanup function if provided - if cleanup_function: - logging.info("Performing cleanup before exiting...") - try: - cleanup_function() - except Exception as e: - logging.error(f"Error during cleanup: {e}") - - sys.exit(0) - - # Register signal handlers - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - logging.info("Signal handlers for SIGINT and SIGTERM are set up.") diff --git a/lxc_autoscale_ml/monitor/README.md b/lxc_autoscale_ml/monitor/README.md deleted file mode 100644 index ea3e167..0000000 --- a/lxc_autoscale_ml/monitor/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# LXC Monitor - -- [Documentation](https://github.com/fabriziosalmi/proxmox-lxc-autoscale/blob/main/docs/lxc_monitor/README.md) diff --git a/lxc_autoscale_ml/monitor/lxc_monitor.py b/lxc_autoscale_ml/monitor/lxc_monitor.py deleted file mode 100644 index 9065d4e..0000000 --- a/lxc_autoscale_ml/monitor/lxc_monitor.py +++ /dev/null @@ -1,327 +0,0 @@ -import asyncio -import json -import logging -import os -import yaml -import psutil -from logging.handlers import TimedRotatingFileHandler -from subprocess import check_output, CalledProcessError -from datetime import datetime -from concurrent.futures import ThreadPoolExecutor -from typing import Any, Dict, List, Optional, Tuple -import aiofiles - -# Load configuration from YAML file -with open("/etc/lxc_autoscale/lxc_monitor.yaml", 'r') as config_file: - config = yaml.safe_load(config_file) - -# Set up logging configuration -LOG_FILE = config['logging']['log_file'] -LOG_MAX_BYTES = config['logging']['log_max_bytes'] -LOG_BACKUP_COUNT = config['logging']['log_backup_count'] -LOG_LEVEL = getattr(logging, config['logging']['log_level'].upper(), logging.INFO) - -# Configure structured logging with time-based rotation and retention -logger = logging.getLogger("LXCMonitor") -logger.setLevel(LOG_LEVEL) -formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - -# Console handler -console_handler = logging.StreamHandler() -console_handler.setFormatter(formatter) - -# Timed rotating file handler with log retention -file_handler = TimedRotatingFileHandler(LOG_FILE, when="midnight", interval=1, backupCount=LOG_BACKUP_COUNT) -file_handler.setFormatter(formatter) -file_handler.suffix = "%Y-%m-%d" - -# Add handlers to the logger -logger.addHandler(console_handler) -logger.addHandler(file_handler) - -# Monitoring configuration -EXPORT_FILE = config['monitoring']['export_file'] -CHECK_INTERVAL = config['monitoring']['check_interval'] -ENABLE_SWAP = config['monitoring']['enable_swap'] -ENABLE_NETWORK = config['monitoring']['enable_network'] -ENABLE_FILESYSTEM = config['monitoring']['enable_filesystem'] -PARALLEL_PROCESSING = config['monitoring']['parallel_processing'] -MAX_WORKERS = config['monitoring']['max_workers'] -EXCLUDED_DEVICES = config['monitoring']['excluded_devices'] -RETRY_LIMIT = config['monitoring'].get('retry_limit', 3) # Maximum retry attempts -RETRY_DELAY = config['monitoring'].get('retry_delay', 2) # Delay between retries in seconds - -def get_running_lxc_containers() -> List[str]: - """Retrieve a list of running LXC containers.""" - try: - pct_output = check_output(['pct', 'list'], text=True).splitlines() - return [line.split()[0] for line in pct_output if 'running' in line] - except CalledProcessError as e: - logger.error(f"Error retrieving LXC containers: {e}") - return [] - -def run_command(command: List[str]) -> Optional[str]: - """Run a shell command in a thread 
pool.""" - try: - return check_output(command, text=True) - except CalledProcessError as e: - logger.error(f"Command failed: {' '.join(command)}, error: {e}") - return None - -async def retry_on_failure(func: Any, *args, **kwargs) -> Any: - """Retry logic wrapper for transient failures.""" - for attempt in range(RETRY_LIMIT): - try: - return await func(*args, **kwargs) - except Exception as e: - logger.warning(f"Attempt {attempt + 1} failed for {func.__name__} with error: {e}") - if attempt < RETRY_LIMIT - 1: - await asyncio.sleep(RETRY_DELAY) - else: - logger.error(f"All {RETRY_LIMIT} attempts failed for {func.__name__}.") - raise - -async def get_container_metric(command: List[str], executor: ThreadPoolExecutor) -> Optional[str]: - """Helper function to execute commands asynchronously in containers.""" - return await asyncio.get_event_loop().run_in_executor(executor, run_command, command) - -async def parse_meminfo(container_id: str, executor: ThreadPoolExecutor) -> Dict[str, float]: - """Retrieve memory and swap usage inside the container.""" - mem_info = {} - for metric, key in [('MemTotal', 'memory_usage_mb'), ('MemAvailable', 'memory_free_mb'), - ('SwapTotal', 'swap_total_mb'), ('SwapFree', 'swap_free_mb')]: - command = ['pct', 'exec', container_id, '--', 'grep', metric, '/proc/meminfo'] - result = await get_container_metric(command, executor) - if result: - try: - mem_info[key] = int(result.split()[1]) / 1024 # Convert to MB - except ValueError: - logger.warning(f"Unexpected memory info format for container {container_id}: {result}") - # Calculate used memory and swap usage - mem_info['memory_usage_mb'] = mem_info.get('memory_usage_mb', 0.0) - mem_info.get('memory_free_mb', 0.0) - mem_info['swap_usage_mb'] = mem_info.get('swap_total_mb', 0.0) - mem_info.get('swap_free_mb', 0.0) - return mem_info - -async def get_container_cpu_usage(container_id: str, executor: ThreadPoolExecutor) -> float: - """Use pct exec to retrieve CPU usage inside the container.""" - command = ['pct', 'exec', container_id, '--', 'grep', 'cpu ', '/proc/stat'] - result = await get_container_metric(command, executor) - if result: - fields = result.split() - if len(fields) < 5: - logger.warning(f"Unexpected CPU stat format for container {container_id}: {fields}") - return 0.0 - idle_time = int(fields[4]) - total_time = sum(int(field) for field in fields[1:]) - return 100 * (1 - (idle_time / total_time)) - return 0.0 - -async def get_container_io_stats(container_id: str, executor: ThreadPoolExecutor) -> Dict[str, int]: - """Retrieve I/O statistics inside the container.""" - command = ['pct', 'exec', container_id, '--', 'grep', '', '/proc/diskstats'] - result = await get_container_metric(command, executor) - if result: - io_stats_lines = result.splitlines() - io_stats = {"reads": 0, "writes": 0} - for line in io_stats_lines: - fields = line.split() - if len(fields) < 10: - logger.warning(f"Unexpected disk stats format for container {container_id}: {fields}") - continue - device = fields[2] - if any(device.startswith(exclude) for exclude in EXCLUDED_DEVICES): - continue - io_stats["reads"] += int(fields[5]) - io_stats["writes"] += int(fields[9]) - return io_stats - return {} - -async def get_container_network_usage(container_id: str, executor: ThreadPoolExecutor) -> Dict[str, int]: - """Retrieve network usage inside the container.""" - if not ENABLE_NETWORK: - return {"rx_bytes": 0, "tx_bytes": 0} - - command = ['pct', 'exec', container_id, '--', 'cat', '/proc/net/dev'] - result = await get_container_metric(command, 
executor) - if result: - net_stats_lines = result.splitlines()[2:] # Skip headers - rx_bytes, tx_bytes = 0, 0 - for line in net_stats_lines: - fields = line.split() - if len(fields) < 10: - logger.warning(f"Unexpected network stats format for container {container_id}: {fields}") - continue - iface = fields[0].split(':')[0] - if iface != 'lo': # Ignore loopback interface - rx_bytes += int(fields[1]) - tx_bytes += int(fields[9]) - return {"rx_bytes": rx_bytes, "tx_bytes": tx_bytes} - return {"rx_bytes": 0, "tx_bytes": 0} - -async def get_container_filesystem_usage(container_id: str, executor: ThreadPoolExecutor) -> Dict[str, float]: - """Retrieve filesystem usage inside the container.""" - if not ENABLE_FILESYSTEM: - return {"filesystem_usage_gb": 0, "filesystem_total_gb": 0, "filesystem_free_gb": 0} - - command = ['pct', 'exec', container_id, '--', 'df', '-m', '/'] - result = await get_container_metric(command, executor) - if result: - lines = result.splitlines() - if len(lines) < 2: - logger.warning(f"Unexpected filesystem stats format for container {container_id}: {lines}") - return {"filesystem_usage_gb": 0, "filesystem_total_gb": 0, "filesystem_free_gb": 0} - - filesystem_stats = lines[1].split() - if len(filesystem_stats) < 4: - logger.warning(f"Incomplete filesystem stats for container {container_id}: {filesystem_stats}") - return {"filesystem_usage_gb": 0, "filesystem_total_gb": 0, "filesystem_free_gb": 0} - - filesystem_total_gb = int(filesystem_stats[1]) / 1024 # Convert MB to GB - filesystem_usage_gb = int(filesystem_stats[2]) / 1024 # Convert MB to GB - filesystem_free_gb = int(filesystem_stats[3]) / 1024 # Convert MB to GB - - return { - "filesystem_usage_gb": filesystem_usage_gb, - "filesystem_total_gb": filesystem_total_gb, - "filesystem_free_gb": filesystem_free_gb - } - return {"filesystem_usage_gb": 0, "filesystem_total_gb": 0, "filesystem_free_gb": 0} - -async def get_container_process_count(container_id: str, executor: ThreadPoolExecutor) -> int: - """Retrieve the number of processes running inside the container.""" - command = ['pct', 'exec', container_id, '--', 'ps', '-e'] - result = await get_container_metric(command, executor) - if result: - lines = result.splitlines() - if lines and any(header in lines[0] for header in ["PID", "TTY", "TIME", "CMD"]): - lines = lines[1:] # Remove header line - return len(lines) - return 0 - -async def collect_metrics_for_container(container_id: str, executor: ThreadPoolExecutor) -> Tuple[str, Dict[str, Any]]: - """Collect all metrics for a given container.""" - logger.info(f"Collecting metrics for container: {container_id}") - - # Use retry logic for each metric collection - cpu_usage = await retry_on_failure(get_container_cpu_usage, container_id, executor) - memory_swap_usage = await retry_on_failure(parse_meminfo, container_id, executor) - io_stats = await retry_on_failure(get_container_io_stats, container_id, executor) - network_usage = await retry_on_failure(get_container_network_usage, container_id, executor) - filesystem_usage = await retry_on_failure(get_container_filesystem_usage, container_id, executor) - process_count = await retry_on_failure(get_container_process_count, container_id, executor) - - container_metrics = { - "timestamp": datetime.now().isoformat(), - "cpu_usage_percent": cpu_usage, - "memory_usage_mb": memory_swap_usage.get("memory_usage_mb", 0.0), - "swap_usage_mb": memory_swap_usage.get("swap_usage_mb", 0.0), - "swap_total_mb": memory_swap_usage.get("swap_total_mb", 0.0), - "process_count": process_count, - 
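-        # io_stats and network_usage below are cumulative counters taken from
-        # /proc/diskstats and /proc/net/dev; diff successive samples to get rates.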
"io_stats": io_stats, - "network_usage": network_usage, - "filesystem_usage_gb": filesystem_usage["filesystem_usage_gb"], - "filesystem_total_gb": filesystem_usage["filesystem_total_gb"], - "filesystem_free_gb": filesystem_usage["filesystem_free_gb"] - } - - logger.info(f"Metrics for {container_id}: {container_metrics}") - - return container_id, container_metrics - -async def collect_and_export_metrics(): - """Collect and export metrics for all running LXC containers.""" - start_time = datetime.now() - metrics = {} - containers = get_running_lxc_containers() - - if not containers: - logger.info("No running LXC containers found.") - return - - logger.debug(f"Found {len(containers)} running containers.") - - async with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: - if PARALLEL_PROCESSING: - tasks = [collect_metrics_for_container(container_id, executor) for container_id in containers] - results = await asyncio.gather(*tasks) - else: - results = [] - for container_id in containers: - result = await collect_metrics_for_container(container_id, executor) - results.append(result) - - # Ensure results are processed correctly - for container_id, container_metrics in results: - metrics[container_id] = container_metrics - - end_time = datetime.now() - total_duration = (end_time - start_time).total_seconds() - - # Add summary to the JSON output - summary = { - "collection_start_time": start_time.isoformat(), - "collection_end_time": end_time.isoformat(), - "total_containers": len(containers), - "total_duration_seconds": total_duration, - "monitor_cpu_percent": psutil.cpu_percent(), - "monitor_memory_usage_mb": psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 - } - metrics["summary"] = summary - - # Load existing data if the file exists - existing_data = await load_existing_data(EXPORT_FILE) - - # Append the new metrics - existing_data.append(metrics) - - logger.debug(f"Appending new metrics: {metrics}") - - # Write the updated data to the file - await write_metrics_to_file(EXPORT_FILE, existing_data) - -async def load_existing_data(file_path: str) -> List[Dict[str, Any]]: - """Load existing data from the JSON file.""" - if not os.path.exists(file_path): - return [] - - try: - async with aiofiles.open(file_path, mode='r') as json_file: - content = await json_file.read() - data = json.loads(content) - if not isinstance(data, list): - logger.error(f"Data in {file_path} is not a list. 
-                return []
-            logger.debug(f"Loaded existing metrics from {file_path}.")
-            return data
-    except (IOError, json.JSONDecodeError) as e:
-        logger.warning(f"Failed to read existing data from {file_path}: {e}")
-        return []
-
-async def write_metrics_to_file(file_path: str, data: List[Dict[str, Any]]):
-    """Write metrics data to a JSON file asynchronously."""
-    temp_file = f"{file_path}.tmp"
-    try:
-        async with aiofiles.open(temp_file, mode='w') as json_file:
-            await json_file.write(json.dumps(data, indent=4))
-        os.replace(temp_file, file_path)
-        logger.info(f"Metrics successfully exported to {file_path}")
-    except IOError as e:
-        logger.error(f"Failed to write metrics to {file_path}: {e}")
-        if os.path.exists(temp_file):
-            os.remove(temp_file)
-
-async def monitor_and_export():
-    """Continuously monitor and export metrics at the defined intervals."""
-    try:
-        while True:
-            logger.info("Starting new metrics collection cycle.")
-            await collect_and_export_metrics()
-            logger.info(f"Waiting for {CHECK_INTERVAL} seconds before the next cycle.")
-            await asyncio.sleep(CHECK_INTERVAL)
-    except KeyboardInterrupt:
-        logger.info("Shutting down metrics collector due to KeyboardInterrupt.")
-    except Exception as e:
-        logger.error(f"Unexpected error: {e}")
-
-if __name__ == "__main__":
-    asyncio.run(monitor_and_export())
diff --git a/lxc_autoscale_ml/monitor/lxc_monitor.service b/lxc_autoscale_ml/monitor/lxc_monitor.service
deleted file mode 100644
index e28d983..0000000
--- a/lxc_autoscale_ml/monitor/lxc_monitor.service
+++ /dev/null
@@ -1,19 +0,0 @@
-[Unit]
-Description=LXC Monitor Service
-Documentation=https://github.com/fabriziosalmi/proxmox-lxc-autoscale
-After=network.target
-
-[Service]
-ExecStart=/usr/bin/python3 /usr/local/bin/lxc_monitor.py
-WorkingDirectory=/usr/local/bin/
-StandardOutput=inherit
-StandardError=inherit
-Restart=on-failure
-User=root
-
-# Runtime environment. The script loads its own settings from
-# /etc/lxc_autoscale/lxc_monitor.yaml; EnvironmentFile cannot parse YAML.
-Environment="PYTHONUNBUFFERED=1"
-
-[Install]
-WantedBy=multi-user.target
diff --git a/lxc_autoscale_ml/monitor/lxc_monitor.yaml b/lxc_autoscale_ml/monitor/lxc_monitor.yaml
deleted file mode 100644
index ca1dcc9..0000000
--- a/lxc_autoscale_ml/monitor/lxc_monitor.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-logging:
-  # Path to the file where log messages are written.
-  log_file: "/var/log/lxc_monitor.log"
-
-  # Maximum size of the log file before rotation (in bytes), 5 MB here. The monitor
-  # currently rotates by time (midnight), so this value is read but not enforced.
-  log_max_bytes: 5242880  # 5 MB
-
-  # Number of backup log files to keep. Older files are deleted as new ones are created.
-  log_backup_count: 7
-
-  # Logging level to determine the verbosity of log messages. Options include "DEBUG", "INFO", "WARNING", "ERROR", and "CRITICAL".
-  log_level: "INFO"
-
-monitoring:
-  # Path to the file where metrics data is exported in JSON format.
-  export_file: "/var/log/lxc_metrics.json"
-
-  # Interval (in seconds) at which the monitoring checks are performed.
-  check_interval: 60  # seconds
-
-  # Flag to enable or disable swap memory monitoring.
-  enable_swap: true
-
-  # Flag to enable or disable network statistics monitoring.
-  enable_network: true
-
-  # Flag to enable or disable filesystem statistics monitoring.
-  enable_filesystem: true
-
-  # Flag to enable or disable parallel processing of monitoring tasks.
-  parallel_processing: true
-
-  # Maximum number of parallel workers to use if parallel_processing is enabled. 
- max_workers: 8 # Max number of parallel workers if parallel_processing is enabled - - # List of device types to exclude from I/O statistics collection. - excluded_devices: ['loop', 'dm-'] # Devices to exclude in I/O stats diff --git a/lxc_autoscale_ml/tools/generate_metrics.py b/lxc_autoscale_ml/tools/generate_metrics.py deleted file mode 100644 index 8c9ed83..0000000 --- a/lxc_autoscale_ml/tools/generate_metrics.py +++ /dev/null @@ -1,197 +0,0 @@ -# This Python script generates fake metrics data for containers. -# It can simulate various metrics such as CPU usage, memory usage, network activity, etc., with options to introduce randomness and spikes in the data. -# The script supports generating data for multiple containers over a specified period and saving the output in JSON format. -# Example use: -# python3 generate_metrics.py --num-containers 5 --num-entries 100000 --interval-seconds 60 --randomness 1.0 --spike-likelihood 0.2 --spike-magnitude 1.0 --output-file fake_metrics.json - - -import json -import argparse -import datetime -import random - -def generate_metrics(container_id, base_time, interval_seconds, randomness_factor, spike_likelihood, spike_magnitude, base_metrics): - """ - Generate a single set of metrics for a container, with optional spikes. - - Args: - container_id (str): The ID of the container. - base_time (datetime): The base timestamp to start from. - interval_seconds (int): Time interval between each data point. - randomness_factor (float): Factor to introduce variability in metrics. - spike_likelihood (float): Probability of a spike occurring (0 to 1). - spike_magnitude (float): Magnitude of the spike when it occurs. - base_metrics (dict): Base metrics with initial values and variability ranges. - - Returns: - dict: A dictionary containing the generated metrics. - datetime: The updated base_time after the interval. 
- """ - # Base metrics with added randomness and optional spikes - cpu_usage_percent = base_metrics["cpu_usage_percent"] + (randomness_factor * base_metrics["cpu_variability"]) - memory_usage_mb = base_metrics["memory_usage_mb"] + (randomness_factor * base_metrics["memory_variability"]) - swap_usage_mb = base_metrics["swap_usage_mb"] - swap_total_mb = base_metrics["swap_total_mb"] - process_count = base_metrics["process_count"] - io_reads = base_metrics["io_reads"] + int(randomness_factor * base_metrics["io_variability"]) - io_writes = base_metrics["io_writes"] + int(randomness_factor * base_metrics["io_variability"]) - network_rx_bytes = base_metrics["network_rx_bytes"] + int(randomness_factor * base_metrics["network_variability"]) - network_tx_bytes = base_metrics["network_tx_bytes"] + int(randomness_factor * base_metrics["network_variability"]) - filesystem_usage_gb = base_metrics["filesystem_usage_gb"] - filesystem_total_gb = base_metrics["filesystem_total_gb"] - filesystem_free_gb = base_metrics["filesystem_free_gb"] - - # Apply spikes based on the spike likelihood - if random.random() < spike_likelihood: - cpu_usage_percent *= (1 + spike_magnitude) - memory_usage_mb *= (1 + spike_magnitude) - io_reads *= (1 + spike_magnitude) - io_writes *= (1 + spike_magnitude) - network_rx_bytes *= (1 + spike_magnitude) - network_tx_bytes *= (1 + spike_magnitude) - - timestamp = base_time + datetime.timedelta(seconds=interval_seconds) - base_time = timestamp - - metrics = { - container_id: { - "timestamp": timestamp.isoformat(), - "cpu_usage_percent": cpu_usage_percent, - "memory_usage_mb": memory_usage_mb, - "swap_usage_mb": swap_usage_mb, - "swap_total_mb": swap_total_mb, - "process_count": process_count, - "io_stats": { - "reads": io_reads, - "writes": io_writes - }, - "network_usage": { - "rx_bytes": network_rx_bytes, - "tx_bytes": network_tx_bytes - }, - "filesystem_usage_gb": filesystem_usage_gb, - "filesystem_total_gb": filesystem_total_gb, - "filesystem_free_gb": filesystem_free_gb - }, - "summary": { - "collection_start_time": (timestamp - datetime.timedelta(seconds=6)).isoformat(), - "collection_end_time": timestamp.isoformat(), - "total_containers": 1, - "total_duration_seconds": 6.0, - "monitor_cpu_percent": cpu_usage_percent, - "monitor_memory_usage_mb": memory_usage_mb - } - } - - return metrics, base_time - -def generate_dataset(container_ids, num_entries, interval_seconds, randomness_factor, spike_likelihood, spike_magnitude, base_metrics): - """ - Generate a dataset of fake metrics for multiple containers, with optional spikes. - - Args: - container_ids (list of str): List of container IDs. - num_entries (int): Number of metric entries to generate for each container. - interval_seconds (int): Time interval between each data point. - randomness_factor (float): Factor to introduce variability in metrics. - spike_likelihood (float): Probability of a spike occurring (0 to 1). - spike_magnitude (float): Magnitude of the spike when it occurs. - base_metrics (dict): Base metrics with initial values and variability ranges. - - Returns: - list of dict: The generated dataset. 
- """ - base_time = datetime.datetime.now() - dataset = [] - - for _ in range(num_entries): - for container_id in container_ids: - metrics, base_time = generate_metrics( - container_id, base_time, interval_seconds, randomness_factor, spike_likelihood, spike_magnitude, base_metrics - ) - dataset.append(metrics) - - return dataset - -def main(): - """ - Main function to parse arguments and generate the fake metrics dataset. - """ - parser = argparse.ArgumentParser(description="Generate fake metrics data with optional spikes.") - parser.add_argument( - '--container-id', nargs='*', type=str, - help="Container ID(s). If not provided, IDs from 100 to 999 will be used." - ) - parser.add_argument( - '--num-containers', type=int, default=1, - help="Number of container IDs to generate (only used if --container-id is not provided)" - ) - parser.add_argument( - '--num-entries', type=int, default=10, - help="Number of metric entries to generate for each container" - ) - parser.add_argument( - '--interval-seconds', type=int, default=60, - help="Interval between data points in seconds" - ) - parser.add_argument( - '--randomness', type=float, default=1.0, - help="Randomness factor for metric generation" - ) - parser.add_argument( - '--spike-likelihood', type=float, default=0.1, - help="Likelihood of a spike occurring (0 to 1)" - ) - parser.add_argument( - '--spike-magnitude', type=float, default=0.5, - help="Magnitude of the spike (as a percentage increase)" - ) - parser.add_argument( - '--output-file', type=str, default="fake_metrics.json", - help="Output file to save the generated dataset" - ) - - # Default base metrics with variability ranges - base_metrics = { - "cpu_usage_percent": 5.0, - "cpu_variability": 0.5, - "memory_usage_mb": 96.0, - "memory_variability": 0.8, - "swap_usage_mb": 94.3359375, - "swap_total_mb": 512.0, - "process_count": 27, - "io_reads": 19500000, - "io_writes": 6900000, - "io_variability": 50000, - "network_rx_bytes": 950000, - "network_tx_bytes": 57000, - "network_variability": 10000, - "filesystem_usage_gb": 1.5947265625, - "filesystem_total_gb": 18.5341796875, - "filesystem_free_gb": 16.0048828125 - } - - args = parser.parse_args() - - if args.container_id: - container_ids = args.container_id - else: - start_id = 100 - end_id = start_id + args.num_containers - end_id = min(end_id, 1000) - container_ids = [str(i) for i in range(start_id, end_id)] - - dataset = generate_dataset( - container_ids, args.num_entries, args.interval_seconds, args.randomness, - args.spike_likelihood, args.spike_magnitude, base_metrics - ) - - with open(args.output_file, 'w', encoding='utf-8') as output_file: - json.dump(dataset, output_file, indent=4) - - print(f"Generated {args.num_entries} entries of fake metrics data for " - f"{len(container_ids)} containers with {args.interval_seconds}s intervals " - f"and saved to {args.output_file}") - -if __name__ == "__main__": - main() diff --git a/lxc_autoscale_ml/uninstall.sh b/lxc_autoscale_ml/uninstall.sh deleted file mode 100644 index 9233592..0000000 --- a/lxc_autoscale_ml/uninstall.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash - -# Log file for uninstallation -LOGFILE="lxc_autoscale_uninstaller.log" - -# Define text styles and emojis -BOLD=$(tput bold) -RESET=$(tput sgr0) -GREEN=$(tput setaf 2) -RED=$(tput setaf 1) -CHECKMARK="\xE2\x9C\x85" # โœ”๏ธ -CROSSMARK="\xE2\x9D\x8C" # โŒ - -# Log function -log() { - local level="$1" - local message="$2" - local timestamp - timestamp=$(date +"%Y-%m-%d %H:%M:%S") - case $level in - "INFO") - echo -e 
"${timestamp} [${GREEN}${level}${RESET}] ${message}" | tee -a "$LOGFILE" - ;; - "ERROR") - echo -e "${timestamp} [${RED}${level}${RESET}] ${message}" | tee -a "$LOGFILE" - ;; - esac -} - -log "INFO" "Starting LXC AutoScale uninstallation..." - -# Function to stop and disable a service -uninstall_service() { - local service_name="$1" - log "INFO" "Stopping and disabling the $service_name service..." - if systemctl stop "$service_name" && systemctl disable "$service_name"; then - log "INFO" "${CHECKMARK} Successfully stopped and disabled $service_name." - else - log "ERROR" "${CROSSMARK} Failed to stop or disable $service_name, or it was not found." - fi -} - -# Function to remove files and directories -remove_files() { - local files=("$@") - for file in "${files[@]}"; do - log "INFO" "Removing $file..." - if rm -rf "$file"; then - log "INFO" "${CHECKMARK} Successfully removed $file." - else - log "ERROR" "${CROSSMARK} Failed to remove $file, or it was not found." - fi - done -} - -# Uninstall LXC AutoScale API -log "INFO" "Uninstalling LXC AutoScale API..." -uninstall_service "lxc_autoscale_api.service" -remove_files "/usr/local/bin/lxc_autoscale_api" "/etc/systemd/system/lxc_autoscale_api.service" "/etc/lxc_autoscale/lxc_autoscale_api.yaml" - -# Uninstall LXC AutoScale ML -log "INFO" "Uninstalling LXC AutoScale ML..." -uninstall_service "lxc_autoscale_ml.service" -remove_files "/usr/local/bin/lxc_autoscale_ml.py" "/etc/systemd/system/lxc_autoscale_ml.service" "/etc/lxc_autoscale/lxc_autoscale_ml.yaml" - -# Uninstall LXC Monitor -log "INFO" "Uninstalling LXC Monitor..." -uninstall_service "lxc_monitor.service" -remove_files "/usr/local/bin/lxc_monitor.py" "/etc/systemd/system/lxc_monitor.service" "/etc/lxc_autoscale/lxc_monitor.yaml" - -# Cleanup shared configuration directory -log "INFO" "Cleaning up shared configuration directory if empty..." -if [ -d "/etc/lxc_autoscale" ] && [ ! "$(ls -A /etc/lxc_autoscale)" ]; then - rmdir /etc/lxc_autoscale - log "INFO" "${CHECKMARK} Successfully removed empty directory /etc/lxc_autoscale." -fi - -# Final cleanup -log "INFO" "Reloading systemd daemon to reflect changes..." -systemctl daemon-reload - -log "INFO" "LXC AutoScale uninstallation complete!"