diff --git a/lxc_autoscale_ml/api/README.md b/lxc_autoscale_ml/api/README.md
deleted file mode 100644
index e63cbe0..0000000
--- a/lxc_autoscale_ml/api/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# LXC AutoScale API
-
-- [Documentation](https://github.com/fabriziosalmi/proxmox-lxc-autoscale/blob/main/docs/lxc_autoscale_api/README.md)
diff --git a/lxc_autoscale_ml/api/cloning_management.py b/lxc_autoscale_ml/api/cloning_management.py
deleted file mode 100644
index d943091..0000000
--- a/lxc_autoscale_ml/api/cloning_management.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from lxc_management import LXCManager
-from utils import create_response, handle_error
-
-def create_clone(vm_id, new_vm_id, new_vm_name):
- try:
- lxc_manager = LXCManager()
- result = lxc_manager.clone(vm_id, new_vm_id, new_vm_name)
- return create_response(data=result, message=f"Container {vm_id} cloned as '{new_vm_name}' with ID {new_vm_id}")
- except Exception as e:
- return handle_error(e)
-
-def delete_clone(vm_id):
- try:
- lxc_manager = LXCManager()
- result = lxc_manager.delete_container(vm_id)
- return create_response(data=result, message=f"Container {vm_id} successfully deleted")
- except Exception as e:
- return handle_error(e)
diff --git a/lxc_autoscale_ml/api/config.py b/lxc_autoscale_ml/api/config.py
deleted file mode 100644
index 902bdb7..0000000
--- a/lxc_autoscale_ml/api/config.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import os
-import yaml
-from flask import Flask
-
-def load_config(config_file='/etc/lxc_autoscale/lxc_autoscale_api.yaml'):
- with open(config_file, 'r') as file:
- config = yaml.safe_load(file)
- return config
-
-def create_app(config=None):
- app = Flask(__name__)
-
- if config is None:
- config = load_config()
-
- app.config['LXC_NODE'] = config['lxc']['node']
- app.config['DEFAULT_STORAGE'] = config['lxc']['default_storage']
- app.config['TIMEOUT'] = config['lxc']['timeout_seconds']
-
- # Load the rate limiting configuration
- app.config['RATE_LIMITING'] = config.get('rate_limiting', {})
-
- # Flask settings
- app.secret_key = os.urandom(24)
- app.config['DEBUG'] = False
- app.config['LOGGING'] = config['logging']
- app.config['ERROR_HANDLING'] = config['error_handling']
-
- return app
diff --git a/lxc_autoscale_ml/api/error_handling.py b/lxc_autoscale_ml/api/error_handling.py
deleted file mode 100644
index 2f04602..0000000
--- a/lxc_autoscale_ml/api/error_handling.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from flask import jsonify, current_app
-import logging
-
-# Centralized error handler
-def handle_error(exception, status_code=500):
- error_handling_config = current_app.config.get('ERROR_HANDLING', {})
-
- # Log the error if logging is enabled
- if error_handling_config.get('log_errors', True):
- logging.error(f"Error occurred: {str(exception)}")
-
- # Optionally notify on critical errors
- if status_code >= 500 and error_handling_config.get('notify_on_critical_errors', False):
- notify_on_critical_error(exception)
-
- # Optionally show stack traces (in non-production environments)
- if error_handling_config.get('show_stack_traces', False):
- response = {
- "status": "error",
- "message": str(exception),
- "stack_trace": repr(exception)
- }
- else:
- response = {
- "status": "error",
- "message": "An internal error occurred. Please contact support if the issue persists."
- }
-
- return jsonify(response), status_code
-
-# Function to notify admins of critical errors (stub function)
-def notify_on_critical_error(exception):
- error_handling_config = current_app.config['ERROR_HANDLING']
- recipients = error_handling_config.get('notification_recipients', [])
-
- # Log the notification action
- logging.info(f"Notifying recipients of critical error: {', '.join(recipients)}")
-
- # Implement actual notification logic (e.g., send an email or post to a Slack channel)
- # This is a placeholder for demonstration purposes
- for recipient in recipients:
- logging.info(f"Notified {recipient} of the error: {str(exception)}")
diff --git a/lxc_autoscale_ml/api/health_check.py b/lxc_autoscale_ml/api/health_check.py
deleted file mode 100644
index 6f41803..0000000
--- a/lxc_autoscale_ml/api/health_check.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from flask import jsonify
-
-def health_check():
- try:
- # Perform health checks here, e.g., checking database connectivity, etc.
- # For now, we just return a simple healthy status.
- status = {
- "status": "healthy",
- "checks": {
- "database": "connected", # Replace with actual database check
- "lxc_commands": "available" # Replace with actual check
- }
- }
- return jsonify(status), 200
- except Exception as e:
- return jsonify({"status": "unhealthy", "error": str(e)}), 500
diff --git a/lxc_autoscale_ml/api/lxc_autoscale_api.py b/lxc_autoscale_ml/api/lxc_autoscale_api.py
deleted file mode 100644
index 82ed554..0000000
--- a/lxc_autoscale_ml/api/lxc_autoscale_api.py
+++ /dev/null
@@ -1,297 +0,0 @@
-import logging
-from flask import Flask, jsonify, request
-from config import create_app
-from scaling import scale_cpu, scale_ram, resize_storage
-from snapshot_management import create_snapshot, list_snapshots, rollback_snapshot
-from cloning_management import create_clone, delete_clone
-from resource_checking import check_vm_status, check_node_status, check_cluster_status
-from health_check import health_check
-from rate_limiting import rate_limit
-from error_handling import handle_error
-from lxc_management import LXCManager
-from utils import create_response
-
-# Setup logging
-log_file = "/var/log/lxc_autoscale_api.log"
-logging.basicConfig(
- level=logging.INFO,
- format="%(asctime)s - %(message)s",
-)
-
-app = create_app()
-
-# Home route
-
-@app.route('/', methods=['GET'])
-def home():
- documentation = """
-
-
-
-
-
- AutoScaleAPI Documentation
-
-
-
-
-
-
-
-
AutoScaleAPI Documentation
-
Welcome to the AutoScaleAPI. Below is a list of available API routes with descriptions and example usage.
-
-
-
-
- Endpoint |
- Methods |
- Description |
- Example |
-
-
-
-
- /scale/cores |
- POST |
- Set the exact number of CPU cores for an LXC container. |
- curl -X POST http://proxmox:5000/scale/cores -H "Content-Type: application/json" -d '{"vm_id": 104, "cores": 4}' |
-
-
- /scale/ram |
- POST |
- Set the exact amount of RAM for an LXC container. |
- curl -X POST http://proxmox:5000/scale/ram -H "Content-Type: application/json" -d '{"vm_id": 104, "memory": 4096}' |
-
-
- /scale/storage/increase |
- POST |
- Increase the storage size of an LXC container's root filesystem. |
- curl -X POST http://proxmox:5000/scale/storage/increase -H "Content-Type: application/json" -d '{"vm_id": 104, "disk_size": 2}' |
-
-
- /snapshot/create |
- POST |
- Create a snapshot for an LXC container. |
- curl -X POST http://proxmox:5000/snapshot/create -H "Content-Type: application/json" -d '{"vm_id": 104, "snapshot_name": "my_snapshot"}' |
-
-
- /snapshot/list |
- GET |
- List all snapshots for an LXC container. |
- curl -X GET "http://proxmox:5000/snapshot/list?vm_id=104" |
-
-
- /snapshot/rollback |
- POST |
- Rollback to a specific snapshot. |
- curl -X POST http://proxmox:5000/snapshot/rollback -H "Content-Type: application/json" -d '{"vm_id": 104, "snapshot_name": "my_snapshot"}' |
-
-
- /clone/create |
- POST |
- Clone an LXC container. |
- curl -X POST http://proxmox:5000/clone/create -H "Content-Type: application/json" -d '{"vm_id": 104, "new_vm_id": 105, "new_vm_name": "cloned_container"}' |
-
-
- /clone/delete |
- DELETE |
- Delete a cloned LXC container. |
- curl -X DELETE http://proxmox:5000/clone/delete -H "Content-Type: application/json" -d '{"vm_id": 105}' |
-
-
- /resource/vm/status |
- GET |
- Check the resource allocation and usage for an LXC container. |
- curl -X GET "http://proxmox:5000/resource/vm/status?vm_id=104" |
-
-
- /resource/node/status |
- GET |
- Check the resource usage of a specific node. |
- curl -X GET "http://proxmox:5000/resource/node/status?node_name=proxmox4" |
-
-
- /health/check |
- GET |
- Perform a health check on the API server. |
- curl -X GET http://proxmox:5000/health/check |
-
-
- /routes |
- GET |
- List all available routes. |
- curl -X GET http://proxmox:5000/routes |
-
-
-
-
-
-
-
-
-
-
- """.replace("", "proxmox")
-
- return documentation
-
-
-# Define routes
-
-@app.route('/scale/cores', methods=['POST'])
-@rate_limit
-def set_cores():
- data = request.json
- vm_id = data['vm_id']
- cores = data['cores']
- logging.info(f"Setting {cores} cores for VM {vm_id}")
- return scale_cpu(vm_id, cores)
-
-@app.route('/scale/ram', methods=['POST'])
-@rate_limit
-def set_ram():
- data = request.json
- vm_id = data['vm_id']
- memory = data['memory']
- logging.info(f"Setting {memory} MB RAM for VM {vm_id}")
- return scale_ram(vm_id, memory)
-
-@app.route('/scale/storage/increase', methods=['POST'])
-@rate_limit
-def increase_storage():
- data = request.json
- vm_id = data['vm_id']
- disk_size = data['disk_size']
- logging.info(f"Increasing storage by {disk_size} GB for VM {vm_id}")
- return resize_storage(vm_id, disk_size)
-
-@app.route('/snapshot/create', methods=['POST'])
-@rate_limit
-def create_snapshot_route():
- data = request.json
- vm_id = data['vm_id']
- snapshot_name = data['snapshot_name']
- logging.info(f"Creating snapshot '{snapshot_name}' for VM {vm_id}")
- return create_snapshot(vm_id, snapshot_name)
-
-@app.route('/snapshot/list', methods=['GET'])
-@rate_limit
-def list_snapshots_route():
- vm_id = request.args.get('vm_id')
- return list_snapshots(vm_id)
-
-@app.route('/snapshot/rollback', methods=['POST'])
-@rate_limit
-def rollback_snapshot_route():
- data = request.json
- vm_id = data['vm_id']
- snapshot_name = data['snapshot_name']
- return rollback_snapshot(vm_id, snapshot_name)
-
-@app.route('/clone/create', methods=['POST'])
-@rate_limit
-def create_clone():
- data = request.json
- vm_id = data['vm_id']
- new_vm_id = data['new_vm_id']
- new_vm_name = data['new_vm_name'].replace('_', '-') # Replace underscores with hyphens
- snapshot_name = f"snapshot-{new_vm_id}" # Create a unique snapshot name
-
- try:
- lxc_manager = LXCManager()
-
- # Step 1: Create a snapshot
- lxc_manager.create_snapshot(vm_id, snapshot_name)
- logging.info(f"Creating snapshot of VM {vm_id} with name '{snapshot_name}'")
-
- # Step 2: Clone the container from the snapshot
- lxc_manager.clone_container(vm_id, new_vm_id, new_vm_name, snapshot_name)
- logging.info(f"Cloning VM {vm_id} to new VM {new_vm_id} with name '{new_vm_name}'")
-
- # Step 3: Start the cloned container
- lxc_manager.start_container(new_vm_id)
- logging.info(f"Starting clone VM {new_vm_id} with name '{new_vm_name}'")
-
- # Step 4: Delete the snapshot
- lxc_manager.delete_snapshot(vm_id, snapshot_name)
- logging.info(f"Deleting snapshot of VM {vm_id} with name '{snapshot_name}'")
-
- return create_response(
- data=f"Container {new_vm_id} cloned from {vm_id} and started successfully. Snapshot {snapshot_name} removed.",
- message="Clone operation completed successfully.",
- status_code=200
- )
- except Exception as e:
- return handle_error(e)
-
-
-@app.route('/clone/delete', methods=['DELETE'])
-@rate_limit
-def delete_clone_route():
- vm_id = request.json['vm_id']
-
- try:
- lxc_manager = LXCManager()
-
- # Step 1: Stop the container
- lxc_manager.stop_container(vm_id)
- logging.info(f"Stopping VM {new_vm_id}")
-
- # Step 2: Destroy the container
- lxc_manager.destroy_container(vm_id)
- logging.info(f"Destroying VM {new_vm_id}")
-
- return create_response(
- data=f"Container {vm_id} stopped and destroyed successfully.",
- message="Delete operation completed successfully.",
- status_code=200
- )
- except Exception as e:
- return handle_error(e)
-
-
-@app.route('/resource/vm/status', methods=['GET'])
-@rate_limit
-def check_vm_status_route():
- vm_id = request.args.get('vm_id')
- logging.info(f"Checking status for VM {vm_id}")
- return check_vm_status(vm_id)
-
-@app.route('/resource/node/status', methods=['GET'])
-@rate_limit
-def check_node_status_route():
- node_name = request.args.get('node_name')
- logging.info(f"Checking status for node {node_name}")
- return check_node_status(node_name)
-
-@app.route('/health/check', methods=['GET'])
-def health_check_route():
- logging.info("Health check endpoint accessed")
- return health_check()
-
-@app.route('/routes', methods=['GET'])
-def list_routes():
- routes = []
- for rule in app.url_map.iter_rules():
- methods = ','.join(rule.methods)
- route = {
- "endpoint": rule.endpoint,
- "methods": methods,
- "url": str(rule)
- }
- routes.append(route)
- logging.info("Routes list endpoint accessed")
- return jsonify(routes), 200
-
-
-# Setup logging
-logging.basicConfig(level=logging.INFO)
-
-if __name__ == "__main__":
- app.run(host="0.0.0.0", port=5000)
diff --git a/lxc_autoscale_ml/api/lxc_autoscale_api.service b/lxc_autoscale_ml/api/lxc_autoscale_api.service
deleted file mode 100644
index b949267..0000000
--- a/lxc_autoscale_ml/api/lxc_autoscale_api.service
+++ /dev/null
@@ -1,18 +0,0 @@
-[Unit]
-Description=LXC AutoScale API
-Documentation=https://github.com/fabriziosalmi/proxmox-lxc-autoscale
-After=network.target
-
-[Service]
-User=root
-Group=root
-Environment="PATH=/usr/local/bin:/usr/bin:/bin:/usr/sbin"
-WorkingDirectory=/usr/local/bin/lxc_autoscale_api/
-ExecStart=/usr/bin/python3 /usr/local/bin/lxc_autoscale_api/lxc_autoscale_api.py
-
-# Restart on failure
-Restart=on-failure
-RestartSec=5
-
-[Install]
-WantedBy=multi-user.target
diff --git a/lxc_autoscale_ml/api/lxc_autoscale_api.yaml b/lxc_autoscale_ml/api/lxc_autoscale_api.yaml
deleted file mode 100644
index a80799b..0000000
--- a/lxc_autoscale_ml/api/lxc_autoscale_api.yaml
+++ /dev/null
@@ -1,87 +0,0 @@
-lxc:
- # Name or IP address of the Proxmox node where LXC containers are managed.
- node: "proxmox" # Replace with your Proxmox node name or IP. Note: if node name or hostname won't work use IP address instead, should work as expected.
-
- # Whether to verify SSL certificates. Set to false for local commands.
- verify_ssl: false # Not applicable since we are using local commands
-
- # Default storage location for LXC containers.
- default_storage: "local-lvm" # Default storage location
-
- # Timeout for operations in seconds.
- timeout_seconds: 10
-
- # Maximum number of retry attempts for operations.
- max_retries: 3
-
-rate_limiting:
- # Maximum number of requests allowed per minute to prevent abuse.
- max_requests_per_minute: 60
-
-logging:
- # Logging level that controls the verbosity of logs. Options include "DEBUG", "INFO", "WARNING", "ERROR", and "CRITICAL".
- level: "INFO"
-
- # Whether to enable log rotation to manage log file size.
- rotate: true
-
- # Maximum size of the log file in megabytes before it is rotated.
- max_log_size_mb: 100
-
- # Number of backup log files to keep. Older files are deleted when new ones are created.
- backup_count: 5
-
- # Whether to use structured logging (e.g., JSON format).
- structured_logging: false
-
- # Format for log messages. Uses Python's logging format string.
- log_format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" # Log format
-
- # Path to the file where access logs are written.
- access_log_file: "access.log" # Separate access log for HTTP requests
-
- # Path to the file where error logs are written.
- error_log_file: "error.log" # Separate error log for exceptions and errors
-
-error_handling:
- # Whether to display stack traces in error messages.
- show_stack_traces: false
-
- # Whether to log errors to the error log file.
- log_errors: true
-
- # Whether to send notifications for critical errors.
- notify_on_critical_errors: false
-
- # List of email addresses to notify in case of critical errors.
- notification_recipients:
- # Email address to receive notifications.
- - "your-email@inbox.com"
-
-gunicorn:
- # Number of worker processes to handle incoming requests.
- workers: 2
-
- # Timeout for worker processes in seconds. Defines how long a worker can handle a request before being killed.
- timeout_seconds: 120
-
- # Log level for Gunicorn, which affects the verbosity of logs.
- log_level: "info"
-
- # Path to the file where Gunicorn access logs are written.
- access_log_file: "/var/log/lxc_autoscale_api_access.log"
-
- # Path to the file where Gunicorn error logs are written.
- error_log_file: "/var/log/lxc_autoscale_api_error.log"
-
- # Whether to preload the application before forking worker processes. Reduces startup time but uses more memory.
- preload_app: true
-
- # Time to wait before forcefully killing workers during a graceful restart.
- graceful_timeout_seconds: 30
-
- # Number of requests a worker will handle before being restarted. Helps to prevent memory leaks.
- max_requests: 500
-
- # Adds a random jitter to `max_requests` to avoid all workers restarting at the same time.
- max_requests_jitter: 50
diff --git a/lxc_autoscale_ml/api/lxc_management.py b/lxc_autoscale_ml/api/lxc_management.py
deleted file mode 100644
index 0549148..0000000
--- a/lxc_autoscale_ml/api/lxc_management.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import subprocess
-import shlex
-from flask import current_app
-import logging
-
-class LXCManager:
- def __init__(self, node=None):
- self.node = node or current_app.config['LXC_NODE']
- self.timeout = current_app.config['TIMEOUT']
-
- def _run_command(self, command):
- try:
- logging.info(f"Running command: {command}")
- result = subprocess.run(
- shlex.split(command),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- timeout=self.timeout,
- universal_newlines=True
- )
- result.check_returncode()
- return result.stdout.strip()
- except subprocess.CalledProcessError as e:
- logging.error(f"Command failed: {e.stderr}")
- raise Exception(f"Command failed: {e.stderr}")
- except subprocess.TimeoutExpired:
- logging.error("Command timed out")
- raise Exception("Command timed out")
-
-
- def stop_container(self, vm_id):
- command = f"pct stop {vm_id}"
- return self._run_command(command)
-
- def destroy_container(self, vm_id):
- command = f"pct destroy {vm_id}"
- return self._run_command(command)
-
- def create_temporary_snapshot(self, vm_id):
- snapshot_name = f"migrate-snapshot-{vm_id}"
- command = f"pct snapshot {vm_id} {snapshot_name}"
- self._run_command(command)
- return snapshot_name
-
- def delete_snapshot(self, vm_id, snapshot_name):
- command = f"pct delsnapshot {vm_id} {snapshot_name}"
- self._run_command(command)
-
- def migrate_container(self, vm_id, target_node):
- # Check for non-migratable snapshots or create a new one for migration
- snapshot_name = self.create_temporary_snapshot(vm_id)
-
- try:
- command = f"pct migrate {vm_id} {target_node}"
- self._run_command(command)
- finally:
- # Clean up the temporary snapshot after migration
- self.delete_snapshot(vm_id, snapshot_name)
-
- def scale_cpu(self, vm_id, cores):
- command = f"pct set {vm_id} -cores {cores}"
- return self._run_command(command)
-
- def scale_ram(self, vm_id, memory):
- command = f"pct set {vm_id} -memory {memory}"
- return self._run_command(command)
-
- def get_current_disk_size(self, vm_id):
- # Retrieve the current size of the root filesystem in GB
- command = f"pct config {vm_id}"
- output = self._run_command(command)
- for line in output.splitlines():
- if line.startswith("rootfs:"):
- current_size = line.split(",")[1].replace("size=", "").replace("G", "").strip()
- return int(current_size)
- raise Exception("Failed to retrieve current disk size")
-
- def resize_storage(self, vm_id, disk_size):
- current_size = self.get_current_disk_size(vm_id)
- new_size = current_size + disk_size
- command = f"pct resize {vm_id} rootfs {new_size}G"
- return self._run_command(command)
-
- def create_snapshot(self, vm_id, snapshot_name):
- command = f"pct snapshot {vm_id} {snapshot_name}"
- return self._run_command(command)
-
- def clone_container(self, vm_id, new_vm_id, new_vm_name, snapshot_name):
- command = f"pct clone {vm_id} {new_vm_id} --hostname {new_vm_name} --snapname {snapshot_name}"
- return self._run_command(command)
-
- def start_container(self, vm_id):
- command = f"pct start {vm_id}"
- return self._run_command(command)
-
- def list_snapshots(self, vm_id):
- command = f"pct listsnapshot {vm_id}"
- return self._run_command(command)
-
- def rollback_snapshot(self, vm_id, snapshot_name):
- command = f"pct rollback {vm_id} {snapshot_name}"
- return self._run_command(command)
-
- def clone(self, vm_id, new_vm_id, new_vm_name):
- command = f"pct clone {vm_id} {new_vm_id} --hostname {new_vm_name} --full"
- return self._run_command(command)
-
- def delete_container(self, vm_id):
- command = f"pct stop {vm_id} && pct destroy {vm_id}"
- return self._run_command(command)
-
- def migrate(self, vm_id, target_node):
- command = f"pct migrate {vm_id} {target_node}"
- return self._run_command(command)
diff --git a/lxc_autoscale_ml/api/rate_limiting.py b/lxc_autoscale_ml/api/rate_limiting.py
deleted file mode 100644
index c8c6062..0000000
--- a/lxc_autoscale_ml/api/rate_limiting.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from functools import wraps
-from flask import request, jsonify, current_app
-import time
-
-# Simple in-memory rate limiting for demonstration purposes
-rate_limit_data = {}
-
-def rate_limit(f):
- @wraps(f)
- def decorated_function(*args, **kwargs):
- client_ip = request.remote_addr
- rate_limiting_config = current_app.config['RATE_LIMITING']
-
- if client_ip not in rate_limit_data:
- rate_limit_data[client_ip] = []
-
- access_times = rate_limit_data[client_ip]
- current_time = time.time()
-
- # Filter out access times that are older than one minute
- access_times = [t for t in access_times if current_time - t < 60]
- rate_limit_data[client_ip] = access_times
-
- if len(access_times) >= rate_limiting_config['max_requests_per_minute']:
- return jsonify({"error": "Rate limit exceeded. Please try again later."}), 429
-
- access_times.append(current_time)
- return f(*args, **kwargs)
-
- return decorated_function
diff --git a/lxc_autoscale_ml/api/resource_checking.py b/lxc_autoscale_ml/api/resource_checking.py
deleted file mode 100644
index a5650b1..0000000
--- a/lxc_autoscale_ml/api/resource_checking.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import subprocess
-from utils import create_response, handle_error
-
-def check_vm_status(vm_id):
- try:
- result = subprocess.run(f"pct status {vm_id}", shell=True, capture_output=True, text=True)
- return create_response(data=result.stdout.strip(), message=f"Resource status retrieved for container {vm_id}")
- except Exception as e:
- return handle_error(e)
-
-def check_node_status(node_name):
- try:
- result = subprocess.run(f"pvesh get /nodes/{node_name}/status", shell=True, capture_output=True, text=True)
- return create_response(data=result.stdout.strip(), message=f"Resource status retrieved for node '{node_name}'")
- except Exception as e:
- return handle_error(e)
-
-def check_cluster_status():
- try:
- result = subprocess.run("pvecm status", shell=True, capture_output=True, text=True)
- return create_response(data=result.stdout.strip(), message="Cluster resource status retrieved successfully")
- except Exception as e:
- return handle_error(e)
diff --git a/lxc_autoscale_ml/api/scaling.py b/lxc_autoscale_ml/api/scaling.py
deleted file mode 100644
index b0b1c39..0000000
--- a/lxc_autoscale_ml/api/scaling.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from lxc_management import LXCManager
-from utils import create_response, handle_error
-
-def scale_cpu(vm_id, cores):
- try:
- lxc_manager = LXCManager()
- result = lxc_manager.scale_cpu(vm_id, cores)
- return create_response(data=result, message=f"CPU cores set to {cores} for container {vm_id}")
- except Exception as e:
- return handle_error(e)
-
-def scale_ram(vm_id, memory):
- try:
- lxc_manager = LXCManager()
- result = lxc_manager.scale_ram(vm_id, memory)
- return create_response(data=result, message=f"RAM set to {memory} MB for container {vm_id}")
- except Exception as e:
- return handle_error(e)
-
-def resize_storage(vm_id, disk_size):
- try:
- lxc_manager = LXCManager()
- result = lxc_manager.resize_storage(vm_id, disk_size)
- return create_response(data=result, message=f"Storage increased by {disk_size} GB for container {vm_id}")
- except Exception as e:
- return handle_error(e)
diff --git a/lxc_autoscale_ml/api/snapshot_management.py b/lxc_autoscale_ml/api/snapshot_management.py
deleted file mode 100644
index 0a96263..0000000
--- a/lxc_autoscale_ml/api/snapshot_management.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from lxc_management import LXCManager
-from utils import create_response, handle_error
-
-def create_snapshot(vm_id, snapshot_name):
- try:
- lxc_manager = LXCManager()
- result = lxc_manager.create_snapshot(vm_id, snapshot_name)
- return create_response(data=result, message=f"Snapshot '{snapshot_name}' created for container {vm_id}")
- except Exception as e:
- return handle_error(e)
-
-def list_snapshots(vm_id):
- try:
- lxc_manager = LXCManager()
- result = lxc_manager.list_snapshots(vm_id)
- return create_response(data=result, message=f"Snapshots listed for container {vm_id}")
- except Exception as e:
- return handle_error(e)
-
-def rollback_snapshot(vm_id, snapshot_name):
- try:
- lxc_manager = LXCManager()
- result = lxc_manager.rollback_snapshot(vm_id, snapshot_name)
- return create_response(data=result, message=f"Container {vm_id} rolled back to snapshot '{snapshot_name}'")
- except Exception as e:
- return handle_error(e)
diff --git a/lxc_autoscale_ml/api/utils.py b/lxc_autoscale_ml/api/utils.py
deleted file mode 100644
index 53e647e..0000000
--- a/lxc_autoscale_ml/api/utils.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from flask import jsonify, current_app
-import logging
-
-def create_response(data=None, message=None, status_code=200):
- response = {
- 'status': 'success' if status_code < 400 else 'error',
- 'message': message,
- 'data': data
- }
- return jsonify(response), status_code
-
-def handle_error(exception, status_code=500):
- error_handling_config = current_app.config.get('ERROR_HANDLING', {})
- if error_handling_config.get('log_errors', True):
- logging.error(f"Error occurred: {str(exception)}")
-
- response = {
- "status": "error",
- "message": str(exception) if error_handling_config.get('show_stack_traces', False) else "An internal error occurred."
- }
- return jsonify(response), status_code
diff --git a/lxc_autoscale_ml/install_api.sh b/lxc_autoscale_ml/install_api.sh
deleted file mode 100644
index c621a0f..0000000
--- a/lxc_autoscale_ml/install_api.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-
-# Ensure the script is run as root
-if [ "$(id -u)" -ne 0 ]; then
- echo "๐ซ This script must be run as root. Please use sudo or run as root."
- exit 1
-fi
-
-# Step 1: Create the necessary directory
-echo "๐ Creating directory /usr/local/bin/lxc_autoscale_api..."
-mkdir -p /usr/local/bin/lxc_autoscale_api
-mkdir -p /etc/lxc_autoscale
-
-# Flask testing server (gunicorn setup will be release later on)
-
-# Step 2: Install necessary packages
-echo "๐ฆ Installing required packages..."
-apt update
-apt install git python3-flask python3-requests -y
-
-# Step 3: Clone the repository
-echo "๐ Cloning the repository..."
-git clone https://github.com/fabriziosalmi/proxmox-lxc-autoscale
-
-# Step 4: Copy service file to systemd
-echo "๐ Copying service file to systemd directory..."
-cp proxmox-lxc-autoscale/lxc_autoscale_ml/api/lxc_autoscale_api.service /etc/systemd/system/lxc_autoscale_api.service
-
-# Step 5: Reload systemd daemon
-echo "๐ Reloading systemd daemon..."
-systemctl daemon-reload
-
-# Step 6: Copy the necessary files to the appropriate directories
-echo "๐ Copying Python scripts and configuration files..."
-cp proxmox-lxc-autoscale/lxc_autoscale_ml/api/*.py /usr/local/bin/lxc_autoscale_api/
-cp proxmox-lxc-autoscale/lxc_autoscale_ml/api/config.yaml /etc/lxc_autoscale/lxc_autoscale_api.yaml
-
-# Step 7: Enable and start the service
-echo "๐ Enabling and starting the autoscaleapi service..."
-systemctl enable lxc_autoscale_api.service
-systemctl start lxc_autoscale_api.service
-systemctl status lxc_autoscale_api.service
-
-# Step 10: Clean up the cloned repository
-echo "๐งน Cleaning up..."
-rm -rf proxmox-lxc-autoscale
-
-echo "โ
Installation complete. The LXC AutoScale API service is now running. ๐"
diff --git a/lxc_autoscale_ml/install_model.sh b/lxc_autoscale_ml/install_model.sh
deleted file mode 100644
index a80ea9f..0000000
--- a/lxc_autoscale_ml/install_model.sh
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/bin/bash
-
-# Variables
-
-# Monitor
-REPO_BASE_URL="https://raw.githubusercontent.com/fabriziosalmi/proxmox-lxc-autoscale/main"
-SCRIPT_URL="${REPO_BASE_URL}/lxc_autoscale_ml/model/lxc_autoscale_ml.py"
-SERVICE_URL="${REPO_BASE_URL}/lxc_autoscale_ml/model/lxc_autoscale_ml.service"
-CONF_URL="${REPO_BASE_URL}/lxc_autoscale_ml/model/lxc_autoscale_ml.yaml"
-
-INSTALL_PATH="/usr/local/bin/lxc_autoscale_ml.py"
-SERVICE_PATH="/etc/systemd/system/lxc_autoscale_ml.service"
-CONF_DIR="/etc/lxc_autoscale"
-YAML_CONF_PATH="${CONF_DIR}/lxc_autoscale_ml.yaml"
-
-# Function to check and stop the service if running
-stop_service_if_running() {
- if systemctl is-active --quiet lxc_autoscale_ml.service; then
- echo "๐ Stopping LXC AutoScale ML service..."
- systemctl stop lxc_autoscale_ml.service
- if [ $? -ne 0 ]; then
- echo "โ Error: Failed to stop the service."
- exit 1
- fi
- fi
-}
-
-# Function to start the service
-start_service() {
- echo "๐ Starting the LXC AutoScale ML service..."
- systemctl start lxc_autoscale_ml.service
- if [ $? -ne 0 ]; then
- echo "โ Error: Failed to start the service."
- exit 1
- fi
-}
-
-# Function to enable the service
-enable_service() {
- echo "๐ง Enabling the LXC AutoScale ML service..."
- systemctl enable lxc_autoscale_ml.service
- if [ $? -ne 0 ]; then
- echo "โ Error: Failed to enable the service."
- exit 1
- fi
-}
-
-# Function to backup existing configuration file
-backup_existing_conf() {
- if [ -f "$YAML_CONF_PATH" ]; then
- timestamp=$(date +"%Y%m%d-%H%M%S")
- backup_conf="${YAML_CONF_PATH}.${timestamp}.backup"
- echo "๐พ Backing up existing configuration file to $backup_conf..."
- cp "$YAML_CONF_PATH" "$backup_conf"
- if [ $? -ne 0 ]; then
- echo "โ Error: Failed to backup the existing configuration file."
- exit 1
- fi
- fi
-}
-
-# Stop the service if it's already running
-stop_service_if_running
-
-# Download the main Python script
-echo "๐ฅ Downloading the LXC AutoScale ML main script..."
-curl -sSL -o $INSTALL_PATH $SCRIPT_URL
-if [ $? -ne 0 ]; then
- echo "โ Error: Failed to download the main script."
- exit 1
-fi
-
-# Make the main script executable
-chmod +x $INSTALL_PATH
-
-# Download the systemd service file
-echo "๐ฅ Downloading the systemd service file..."
-curl -sSL -o $SERVICE_PATH $SERVICE_URL
-if [ $? -ne 0 ]; then
- echo "โ Error: Failed to download the service file."
- exit 1
-fi
-
-# Set up the configuration directory and file, with backup if needed
-echo "๐ Setting up configuration directory and file..."
-mkdir -p $CONF_DIR
- backup_existing_conf
- curl -sSL -o $YAML_CONF_PATH $CONF_URL
- if [ $? -ne 0 ]; then
- echo "โ Error: Failed to download the configuration file."
- exit 1
- fi
-
-# Reload systemd to recognize the new service
-echo "๐ Reloading systemd daemon..."
-systemctl daemon-reload
-
-# Enable and start the LXC AutoScale service
-enable_service
-start_service
-
-# Check the status of the service
-echo "๐ Checking service status..."
-systemctl status lxc_autoscale_ml.service --no-pager
-
-# Verify that the service is running
-if systemctl is-active --quiet lxc_autoscale_ml.service; then
- echo "โ
LXC AutoScale ML service is running successfully."
-else
- echo "โ Error: LXC AutoScale ML service failed to start."
- exit 1
-fi
-
-echo "๐ Installation and setup completed successfully."
diff --git a/lxc_autoscale_ml/install_monitor.sh b/lxc_autoscale_ml/install_monitor.sh
deleted file mode 100644
index 2372b64..0000000
--- a/lxc_autoscale_ml/install_monitor.sh
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/bin/bash
-
-# Variables
-
-# Monitor
-REPO_BASE_URL="https://raw.githubusercontent.com/fabriziosalmi/proxmox-lxc-autoscale/main"
-SCRIPT_URL="${REPO_BASE_URL}/lxc_autoscale_ml/monitor/lxc_monitor.py"
-SERVICE_URL="${REPO_BASE_URL}/lxc_autoscale_ml/monitor/lxc_monitor.service"
-CONF_URL="${REPO_BASE_URL}/lxc_autoscale_ml/monitor/lxc_monitor.yaml"
-
-INSTALL_PATH="/usr/local/bin/lxc_monitor.py"
-SERVICE_PATH="/etc/systemd/system/lxc_monitor.service"
-CONF_DIR="/etc/lxc_autoscale"
-YAML_CONF_PATH="${CONF_DIR}/lxc_monitor.yaml"
-LOG_PATH="/var/log/lxc_monitor.log"
-
-# Function to check and stop the service if running
-stop_service_if_running() {
- if systemctl is-active --quiet lxc_monitor.service; then
- echo "๐ Stopping LXC AutoScale Monitor service..."
- systemctl stop lxc_monitor.service
- if [ $? -ne 0 ]; then
- echo "โ Error: Failed to stop the service."
- exit 1
- fi
- fi
-}
-
-# Function to start the service
-start_service() {
- echo "๐ Starting the LXC AutoScale Monitor service..."
- systemctl start lxc_monitor.service
- if [ $? -ne 0 ]; then
- echo "โ Error: Failed to start the service."
- exit 1
- fi
-}
-
-# Function to enable the service
-enable_service() {
- echo "๐ง Enabling the LXC AutoScale Monitor service..."
- systemctl enable lxc_monitor.service
- if [ $? -ne 0 ]; then
- echo "โ Error: Failed to enable the service."
- exit 1
- fi
-}
-
-# Function to backup existing configuration file
-backup_existing_conf() {
- if [ -f "$YAML_CONF_PATH" ]; then
- timestamp=$(date +"%Y%m%d-%H%M%S")
- backup_conf="${YAML_CONF_PATH}.${timestamp}.backup"
- echo "๐พ Backing up existing configuration file to $backup_conf..."
- cp "$YAML_CONF_PATH" "$backup_conf"
- if [ $? -ne 0 ]; then
- echo "โ Error: Failed to backup the existing configuration file."
- exit 1
- fi
- fi
-}
-
-# Stop the service if it's already running
-stop_service_if_running
-
-# Download the main Python script
-echo "๐ฅ Downloading the LXC AutoScale Monitor main script..."
-curl -sSL -o $INSTALL_PATH $SCRIPT_URL
-if [ $? -ne 0 ]; then
- echo "โ Error: Failed to download the main script."
- exit 1
-fi
-
-# Make the main script executable
-chmod +x $INSTALL_PATH
-
-# Download the systemd service file
-echo "๐ฅ Downloading the systemd service file..."
-curl -sSL -o $SERVICE_PATH $SERVICE_URL
-if [ $? -ne 0 ]; then
- echo "โ Error: Failed to download the service file."
- exit 1
-fi
-
-# Set up the configuration directory and file, with backup if needed
-echo "๐ Setting up configuration directory and file..."
-mkdir -p $CONF_DIR
- backup_existing_conf
- curl -sSL -o $YAML_CONF_PATH $CONF_URL
- if [ $? -ne 0 ]; then
- echo "โ Error: Failed to download the configuration file."
- exit 1
- fi
-
-# Create the log file if it doesn't exist
-touch $LOG_PATH
-
-# Set the correct permissions
-echo "๐ง Setting permissions..."
-chown root:root $LOG_PATH
-chmod 644 $LOG_PATH
-
-# Reload systemd to recognize the new service
-echo "๐ Reloading systemd daemon..."
-systemctl daemon-reload
-
-# Enable and start the LXC AutoScale service
-enable_service
-start_service
-
-# Check the status of the service
-echo "๐ Checking service status..."
-systemctl status lxc_monitor.service --no-pager
-
-# Verify that the service is running
-if systemctl is-active --quiet lxc_monitor.service; then
- echo "โ
LXC Monitor service is running successfully."
-else
- echo "โ Error: LXC Monitor service failed to start."
- exit 1
-fi
-
-echo "๐ Installation and setup completed successfully."
diff --git a/lxc_autoscale_ml/model/README.md b/lxc_autoscale_ml/model/README.md
deleted file mode 100644
index c947534..0000000
--- a/lxc_autoscale_ml/model/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# LXC AutoScale ML (the model)
-
-- [Documentation](https://github.com/fabriziosalmi/proxmox-lxc-autoscale/blob/main/docs/lxc_model/README.md)
diff --git a/lxc_autoscale_ml/model/config_manager.py b/lxc_autoscale_ml/model/config_manager.py
deleted file mode 100644
index e97af79..0000000
--- a/lxc_autoscale_ml/model/config_manager.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import yaml
-import logging
-import os
-
-class ConfigError(Exception):
- pass
-
-def load_config(config_path, default_config=None):
- if not os.path.exists(config_path):
- logging.error(f"Configuration file not found: {config_path}")
- raise ConfigError(f"Configuration file not found: {config_path}")
-
- try:
- with open(config_path, 'r') as file:
- config = yaml.safe_load(file)
- logging.info(f"Configuration loaded from {config_path}")
- except yaml.YAMLError as e:
- logging.error(f"Error parsing YAML file: {e}")
- raise ConfigError(f"Error parsing YAML file: {e}")
- except Exception as e:
- logging.error(f"Unexpected error loading configuration: {e}")
- raise ConfigError(f"Unexpected error loading configuration: {e}")
-
- if default_config:
- config = {**default_config, **config}
- logging.debug(f"Configuration merged with default values.")
-
- required_keys = ['log_file', 'interval_seconds', 'api']
- for key in required_keys:
- if key not in config:
- logging.error(f"Missing required configuration key: {key}")
- raise ConfigError(f"Missing required configuration key: {key}")
-
- if 'api_url' not in config['api']:
- logging.error("Missing required configuration key: api_url")
- raise ConfigError("Missing required configuration key: api_url")
-
- logging.debug(f"Final configuration: {config}")
- return config
diff --git a/lxc_autoscale_ml/model/data_manager.py b/lxc_autoscale_ml/model/data_manager.py
deleted file mode 100644
index 6e61b9a..0000000
--- a/lxc_autoscale_ml/model/data_manager.py
+++ /dev/null
@@ -1,135 +0,0 @@
-import pandas as pd
-import logging
-import json
-import numpy as np
-from sklearn.preprocessing import StandardScaler
-
-def load_data(file_path):
- try:
- with open(file_path, 'r') as f:
- data = json.load(f)
- logging.info(f"Data loaded successfully from {file_path}.")
- except FileNotFoundError:
- logging.error(f"File not found: {file_path}")
- return None
- except json.JSONDecodeError as e:
- logging.error(f"JSON decoding failed: {e}")
- return None
- except Exception as e:
- logging.error(f"Unexpected error loading data: {e}")
- return None
-
- if not data:
- logging.error(f"No data found in {file_path}.")
- return None
-
- records = []
- for snapshot in data:
- for container_id, metrics in snapshot.items():
- if container_id == "summary":
- continue # Skip the summary, focus on container data
-
- try:
- record = {
- "container_id": container_id,
- "timestamp": metrics["timestamp"],
- "cpu_usage_percent": metrics["cpu_usage_percent"],
- "memory_usage_mb": metrics["memory_usage_mb"],
- "swap_usage_mb": metrics.get("swap_usage_mb", 0),
- "swap_total_mb": metrics.get("swap_total_mb", 0),
- "process_count": metrics["process_count"],
- "io_reads": metrics["io_stats"]["reads"],
- "io_writes": metrics["io_stats"]["writes"],
- "network_rx_bytes": metrics["network_usage"]["rx_bytes"],
- "network_tx_bytes": metrics["network_usage"]["tx_bytes"],
- "filesystem_usage_gb": metrics["filesystem_usage_gb"],
- "filesystem_total_gb": metrics["filesystem_total_gb"],
- "filesystem_free_gb": metrics["filesystem_free_gb"],
- }
- records.append(record)
- except KeyError as e:
- logging.warning(f"Missing expected metric {e} in container {container_id}. Skipping.")
- continue
-
- if not records:
- logging.error("No valid records found in the data.")
- return None
-
- df = pd.DataFrame(records)
- try:
- df['timestamp'] = pd.to_datetime(df['timestamp'])
- except Exception as e:
- logging.error(f"Error converting timestamps: {e}")
- return None
-
- logging.info("Data preprocessed successfully.")
- return df
-
-
-def preprocess_data(df, config):
- if df.empty:
- logging.error("DataFrame is empty. Skipping preprocessing.")
- return df
-
- try:
- spike_threshold = config.get('spike_detection', {}).get('spike_threshold', 2)
- rolling_window_size = config.get('rolling_window', 5)
-
- # Derived metrics
- df['cpu_per_process'] = df['cpu_usage_percent'] / df['process_count']
- df['memory_per_process'] = df['memory_usage_mb'] / df['process_count']
- df['cpu_memory_ratio'] = df['cpu_usage_percent'] / df['memory_usage_mb'] # New metric: CPU to Memory Ratio
-
- # Fix for time_diff calculation
- try:
- df['time_diff'] = df.groupby('container_id')['timestamp'].diff().dt.total_seconds()
- df['time_diff'].fillna(0, inplace=True)
- except Exception as e:
- logging.error(f"Error calculating 'time_diff': {e}")
- df['time_diff'] = 0 # Default to 0 in case of errors
-
- # Rolling statistics for spike detection and trend analysis
- df['rolling_mean_cpu'] = df.groupby('container_id')['cpu_usage_percent'].transform(
- lambda x: x.rolling(window=rolling_window_size, min_periods=1).mean())
- df['rolling_std_cpu'] = df.groupby('container_id')['cpu_usage_percent'].transform(
- lambda x: x.rolling(window=rolling_window_size, min_periods=1).std()).fillna(0)
- df['rolling_mean_memory'] = df.groupby('container_id')['memory_usage_mb'].transform(
- lambda x: x.rolling(window=rolling_window_size, min_periods=1).mean())
- df['rolling_std_memory'] = df.groupby('container_id')['memory_usage_mb'].transform(
- lambda x: x.rolling(window=rolling_window_size, min_periods=1).std()).fillna(0)
-
- # Spike detection
- df['cpu_spike'] = np.abs(df['cpu_usage_percent'] - df['rolling_mean_cpu']) > (spike_threshold * df['rolling_std_cpu'])
- df['memory_spike'] = np.abs(df['memory_usage_mb'] - df['rolling_mean_memory']) > (spike_threshold * df['rolling_std_memory'])
-
- # Trend detection using the slope of the rolling window
- df['cpu_trend'] = df.groupby('container_id')['cpu_usage_percent'].transform(
- lambda x: np.polyfit(np.arange(len(x)), x.rolling(window=rolling_window_size, min_periods=1).mean(), 1)[0])
- df['memory_trend'] = df.groupby('container_id')['memory_usage_mb'].transform(
- lambda x: np.polyfit(np.arange(len(x)), x.rolling(window=rolling_window_size, min_periods=1).mean(), 1)[0])
-
- # Aggregated features
- df['max_cpu'] = df.groupby('container_id')['cpu_usage_percent'].transform(
- lambda x: x.rolling(window=rolling_window_size, min_periods=1).max())
- df['min_cpu'] = df.groupby('container_id')['cpu_usage_percent'].transform(
- lambda x: x.rolling(window=rolling_window_size, min_periods=1).min())
- df['max_memory'] = df.groupby('container_id')['memory_usage_mb'].transform(
- lambda x: x.rolling(window=rolling_window_size, min_periods=1).max())
- df['min_memory'] = df.groupby('container_id')['memory_usage_mb'].transform(
- lambda x: x.rolling(window=rolling_window_size, min_periods=1).min())
-
- # Feature scaling
- scaler = StandardScaler()
- features_to_scale = [
- 'cpu_usage_percent', 'memory_usage_mb', 'cpu_per_process', 'memory_per_process', 'time_diff',
- 'cpu_trend', 'memory_trend', 'max_cpu', 'min_cpu', 'max_memory', 'min_memory',
- 'cpu_memory_ratio'
- ]
- df[features_to_scale] = scaler.fit_transform(df[features_to_scale])
-
- logging.info("Feature engineering, spike detection, and trend detection completed.")
- except Exception as e:
- logging.error(f"Error during data preprocessing: {e}")
- return df
-
- return df
diff --git a/lxc_autoscale_ml/model/lock_manager.py b/lxc_autoscale_ml/model/lock_manager.py
deleted file mode 100644
index 1e29b40..0000000
--- a/lxc_autoscale_ml/model/lock_manager.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import os
-import sys
-import logging
-
-def create_lock_file(lock_file):
- if os.path.exists(lock_file):
- logging.error("Another instance of the script is already running. Exiting.")
- sys.exit(1)
- else:
- with open(lock_file, 'w') as lf:
- lf.write(str(os.getpid()))
- logging.info(f"Lock file created at {lock_file}.")
-
-def remove_lock_file(lock_file):
- if os.path.exists(lock_file):
- os.remove(lock_file)
- logging.info(f"Lock file {lock_file} removed.")
- else:
- logging.warning(f"Lock file {lock_file} was not found.")
diff --git a/lxc_autoscale_ml/model/logger.py b/lxc_autoscale_ml/model/logger.py
deleted file mode 100644
index e1772b7..0000000
--- a/lxc_autoscale_ml/model/logger.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import logging
-from logging.handlers import RotatingFileHandler
-
-import logging
-
-def setup_logging(log_file, log_level="INFO"):
- """
- Set up logging to log to both a file and the console.
-
- :param log_file: Path to the log file.
- :param log_level: The logging level (default is "INFO").
- """
- # Convert log level string to logging level
- log_level = getattr(logging, log_level.upper(), logging.INFO)
-
- # Create a logger
- logger = logging.getLogger()
- logger.setLevel(log_level)
-
- # Create file handler
- file_handler = logging.FileHandler(log_file)
- file_handler.setLevel(log_level)
-
- # Create console handler
- console_handler = logging.StreamHandler()
- console_handler.setLevel(log_level)
-
- # Define log format
- formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
- file_handler.setFormatter(formatter)
- console_handler.setFormatter(formatter)
-
- # Add handlers to the logger
- logger.addHandler(file_handler)
- logger.addHandler(console_handler)
diff --git a/lxc_autoscale_ml/model/lxc_autoscale_ml.py b/lxc_autoscale_ml/model/lxc_autoscale_ml.py
deleted file mode 100644
index 1745eeb..0000000
--- a/lxc_autoscale_ml/model/lxc_autoscale_ml.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import time
-import logging
-import pandas as pd
-
-# Ensure all modules in the lxc_autoscale_ml directory are accessible
-sys.path.append('/usr/local/bin/lxc_autoscale_ml')
-
-# Import custom modules
-from logger import setup_logging
-from lock_manager import create_lock_file, remove_lock_file
-from config_manager import load_config
-from data_manager import load_data, preprocess_data
-from model import train_anomaly_models, predict_anomalies
-from scaling import determine_scaling_action, apply_scaling
-from signal_handler import setup_signal_handlers
-
-
-def main():
- config = load_config("/etc/lxc_autoscale_ml/lxc_autoscale_ml.yaml")
- setup_logging(config.get("log_file", "/var/log/lxc_autoscale_ml.log"))
-
- logging.info("Starting the LXC auto-scaling script...")
-
- create_lock_file(config.get("lock_file", "/tmp/lxc_autoscale_ml.lock"))
-
- try:
- while True:
- # Load and preprocess data
- df = load_data(config.get("data_file", "/var/log/lxc_metrics.json"))
- if df is None:
- logging.error("Exiting due to data loading error.")
- return
-
- df = preprocess_data(df, config)
-
- # Train anomaly detection models
- model, features_to_use = train_anomaly_models(df, config)
- if model is None:
- logging.error("Model training failed. Exiting.")
- return
-
- logging.info("Processing containers for scaling decisions...")
-
- # Iterate over each container and make scaling decisions
- for container_id in df["container_id"].unique():
- container_data = df[df["container_id"] == container_id]
- latest_metrics = container_data.iloc[-1]
-
- logging.debug(f"Latest metrics for container {container_id}: {latest_metrics.to_dict()}")
-
- scaling_decision, confidence = predict_anomalies(model, latest_metrics, features_to_use, config)
-
- if scaling_decision is not None:
- cpu_action, ram_action, new_cores, new_ram = determine_scaling_action(latest_metrics, scaling_decision, confidence, config)
- logging.debug(f"Scaling decision for container {container_id}: CPU - {cpu_action}, RAM - {ram_action} | Confidence: {confidence:.2f}%")
-
- if cpu_action != "No Scaling" or ram_action != "No Scaling":
- logging.info(f"Applying scaling actions for container {container_id}: CPU - {cpu_action}, RAM - {ram_action} | Confidence: {confidence:.2f}%")
- apply_scaling(container_id, new_cores, new_ram, config)
- else:
- logging.info(f"No scaling needed for container {container_id}. | Confidence: {confidence:.2f}%")
- else:
- logging.warning(f"Skipping scaling for container {container_id} due to lack of prediction.")
-
- # Sleep until the next interval
- logging.info(f"Sleeping for {config.get('interval_seconds', 60)} seconds before the next run.")
- time.sleep(config.get("interval_seconds", 60))
-
- except Exception as e:
- logging.error(f"An error occurred: {e}")
- finally:
- remove_lock_file(config.get("lock_file", "/tmp/lxc_autoscale_ml.lock"))
- logging.info("Script execution completed.")
-
-if __name__ == "__main__":
- # Setup signal handlers to ensure graceful shutdown
- setup_signal_handlers()
- main()
diff --git a/lxc_autoscale_ml/model/lxc_autoscale_ml.service b/lxc_autoscale_ml/model/lxc_autoscale_ml.service
deleted file mode 100644
index 982ff40..0000000
--- a/lxc_autoscale_ml/model/lxc_autoscale_ml.service
+++ /dev/null
@@ -1,19 +0,0 @@
-[Unit]
-Description=LXC AutoScale ML Service
-Documentation=https://github.com/fabriziosalmi/proxmox-lxc-autoscale
-After=network.target
-
-[Service]
-ExecStart=/usr/bin/python3 /usr/local/bin/lxc_autoscale_ml/lxc_autoscale_ml.py
-WorkingDirectory=/usr/local/bin/lxc_autoscale_ml
-StandardOutput=inherit
-StandardError=inherit
-Restart=on-failure
-User=root
-
-# Logging configuration
-Environment="PYTHONUNBUFFERED=1"
-EnvironmentFile=/etc/lxc_autoscale_ml/lxc_autoscale_ml.yaml
-
-[Install]
-WantedBy=multi-user.target
diff --git a/lxc_autoscale_ml/model/lxc_autoscale_ml.yaml b/lxc_autoscale_ml/model/lxc_autoscale_ml.yaml
deleted file mode 100644
index 77a2997..0000000
--- a/lxc_autoscale_ml/model/lxc_autoscale_ml.yaml
+++ /dev/null
@@ -1,86 +0,0 @@
-# =============================================================
-# LXC AutoScale ML
-# Automated Container Scaling Configuration
-# -------------------------------------------------------------
-# This configuration file controls the behavior of the LXC
-# AutoScale ML script. It includes settings for logging,
-# model training, spike detection, scaling actions, API
-# interactions, and more.
-#
-# The script is designed to automatically scale LXC containers
-# based on real-time metrics, such as CPU usage, memory usage,
-# and other system indicators. The scaling decisions are made
-# using machine learning models, which can detect anomalies
-# and trends in the data.
-#
-# Author: Fabrizio Salmi - fabrizio.salmi@gmail.com
-# Date: August 20, 2024
-# =============================================================
-
-# Logging Configuration
-log_file: "/var/log/lxc_autoscale_ml.log" # Path to the log file
-log_level: "DEBUG" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
-
-# Lock File Configuration
-lock_file: "/tmp/lxc_autoscale_ml.lock" # Path to the lock file to prevent multiple instances
-
-# Data File Configuration
-data_file: "/var/log/lxc_metrics.json" # Path to the metrics file containing container data produced by LXC AutoScale API
-
-# Model Configuration
-model:
- contamination: 0.05 # Contamination level for IsolationForest (fraction of outliers)
- n_estimators: 100 # Number of trees in IsolationForest
- max_samples: 64 # Number of samples to draw for training each tree
- random_state: 42 # Random seed for reproducibility
-
-# Spike Detection Configuration
-spike_detection:
- spike_threshold: 2 # Number of standard deviations for spike detection
- rolling_window: 5 # Window size for rolling mean and standard deviation
-
-# Scaling Configuration
-scaling:
- total_cores: 4 # Total number of CPU cores available on the server
- total_ram_mb: 16384 # Total RAM available on the server in MB
- target_cpu_load_percent: 50 # Target CPU load percentage after scaling
- max_cpu_cores: 4 # Maximum number of CPU cores to maintain per container
- max_ram_mb: 8192 # Maximum RAM to maintain per container in MB
- min_cpu_cores: 2 # Minimum number of CPU cores to maintain per container
- min_ram_mb: 1024 # Minimum RAM to maintain per container in MB
- cpu_scale_up_threshold: 75 # CPU usage percentage to trigger scale-up
- cpu_scale_down_threshold: 30 # CPU usage percentage to trigger scale-down
- ram_scale_up_threshold: 75 # RAM usage percentage to trigger scale-up
- ram_scale_down_threshold: 30 # RAM usage percentage to trigger scale-down
- ram_chunk_size: 50 # Minimum RAM scaling chunk size in MB
- ram_upper_limit: 1024 # Maximum RAM scaling limit in one step in MB
- dry_run: false # If true, perform a dry run without making actual API calls
-
-# API Configuration
-api:
- api_url: "http://127.0.0.1:5000" # Base URL for the API used for scaling actions
- cores_endpoint: "/scale/cores" # Endpoint for scaling CPU cores
- ram_endpoint: "/scale/ram" # Endpoint for scaling RAM
-
-# Retry Logic for API Calls
-retry_logic:
- max_retries: 3 # Maximum number of retries for API calls
- retry_delay: 2 # Delay between retries in seconds
-
-# Interval Configuration
-interval_seconds: 60 # Time interval between consecutive script runs in seconds
-
-# Feature Engineering Configuration
-feature_engineering:
- include_io_activity: true # Include IO activity as a feature in the model
- include_network_activity: true # Include network activity as a feature in the model
-
-# Prediction Configuration
-prediction:
- use_latest_only: true # If true, use only the latest data point for prediction
- include_rolling_features: true # Include rolling mean and std features for prediction
-
-# Ignored Containers
-ignore_lxc:
- - "101" # List of container IDs to ignore from scaling
- - "102"
diff --git a/lxc_autoscale_ml/model/model.py b/lxc_autoscale_ml/model/model.py
deleted file mode 100644
index 36caeca..0000000
--- a/lxc_autoscale_ml/model/model.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from sklearn.ensemble import IsolationForest
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler
-import logging
-import numpy as np
-import pandas as pd
-
-def train_anomaly_models(df, config):
- # Select only numeric features for training, excluding non-relevant columns
- features_to_use = df.select_dtypes(include=[np.number]).columns.difference(['container_id', 'timestamp'])
- X_train = df[features_to_use]
-
- logging.info(f"Features used for training: {list(features_to_use)}")
-
- model = IsolationForest(
- contamination=config.get('model', {}).get('contamination', 0.05),
- n_estimators=config.get('model', {}).get('n_estimators', 100),
- max_samples=config.get('model', {}).get('max_samples', 64),
- random_state=config.get('model', {}).get('random_state', 42)
- )
-
- try:
- pipeline = Pipeline([
- ('scaler', StandardScaler()),
- ('IsolationForest', model)
- ])
- pipeline.fit(X_train)
- logging.info("IsolationForest model training completed.")
- return pipeline, features_to_use # Return both the model and the feature set used
- except Exception as e:
- logging.error(f"Error during model training: {e}")
- return None, None
-
-
-def predict_anomalies(model, latest_metrics, features_to_use, config):
- # Ensure only the relevant features are used for prediction
- latest_metrics_df = latest_metrics[features_to_use].to_frame().T
-
- logging.debug(f"Features used for prediction: {latest_metrics_df.columns.tolist()}")
-
- try:
- anomaly_score = model.decision_function(latest_metrics_df)
- # Convert anomaly_score to a scalar if it's an array
- anomaly_score = anomaly_score.item() if isinstance(anomaly_score, np.ndarray) else anomaly_score
- # Normalize the anomaly score to a confidence level (0 to 100%)
- confidence = (1 - anomaly_score) * 100 # Inverse because lower scores mean more abnormal
- logging.debug(f"Anomaly score: {anomaly_score}, Confidence: {confidence:.2f}%")
- prediction = model.predict(latest_metrics_df)
- prediction = prediction.item() if isinstance(prediction, np.ndarray) else prediction
- return prediction, confidence
- except Exception as e:
- logging.error(f"Error during prediction with IsolationForest: {e}")
- return None, 0
diff --git a/lxc_autoscale_ml/model/scaling.py b/lxc_autoscale_ml/model/scaling.py
deleted file mode 100644
index 1a86ed0..0000000
--- a/lxc_autoscale_ml/model/scaling.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import requests
-import logging
-import time
-import sys
-import pandas as pd
-
-# Ensure all modules in the lxc_autoscale_ml directory are accessible
-sys.path.append('/usr/local/bin/lxc_autoscale_ml')
-
-# Import custom modules
-from logger import setup_logging
-from lock_manager import create_lock_file, remove_lock_file
-from config_manager import load_config
-from model import train_anomaly_models, predict_anomalies
-from signal_handler import setup_signal_handlers
-
-def determine_scaling_action(latest_metrics, scaling_decision, confidence, config):
- cpu_action = "No Scaling"
- ram_action = "No Scaling"
- new_cores = None
- new_ram = None
-
- cpu_usage = latest_metrics["cpu_usage_percent"]
- memory_usage = latest_metrics["memory_usage_mb"]
- cpu_memory_ratio = latest_metrics.get("cpu_memory_ratio", None)
- io_ops_per_second = latest_metrics.get("io_ops_per_second", None)
-
- logging.debug(f"CPU usage: {cpu_usage}% | Memory usage: {memory_usage}MB | Confidence: {confidence}%")
-
- cpu_thresholds = config["scaling"]
- ram_thresholds = config["scaling"]
-
- if scaling_decision:
- logging.debug("Anomaly detected. Scaling up CPU and RAM.")
- cpu_action = "Scale Up"
- ram_action = "Scale Up"
- else:
- if cpu_usage > cpu_thresholds["cpu_scale_up_threshold"]:
- cpu_action = "Scale Up"
- logging.debug(f"CPU usage {cpu_usage}% exceeds the scale-up threshold.")
- elif cpu_usage < cpu_thresholds["cpu_scale_down_threshold"]:
- cpu_action = "Scale Down"
- logging.debug(f"CPU usage {cpu_usage}% is below the scale-down threshold.")
-
- if memory_usage > ram_thresholds["ram_scale_up_threshold"]:
- ram_action = "Scale Up"
- logging.debug(f"Memory usage {memory_usage}MB exceeds the scale-up threshold.")
- elif memory_usage < ram_thresholds["ram_scale_down_threshold"]:
- ram_action = "Scale Down"
- logging.debug(f"Memory usage {memory_usage}MB is below the scale-down threshold.")
-
- # Ensure scaling stays within limits
- if cpu_action == "Scale Up":
- new_cores = min(cpu_thresholds["total_cores"], cpu_thresholds["max_cpu_cores"])
- elif cpu_action == "Scale Down":
- new_cores = max(cpu_thresholds["min_cpu_cores"], cpu_thresholds["min_cpu_cores"])
-
- if ram_action == "Scale Up":
- new_ram = min(ram_thresholds["total_ram_mb"], ram_thresholds["max_ram_mb"])
- elif ram_action == "Scale Down":
- new_ram = max(ram_thresholds["min_ram_mb"], ram_thresholds["min_ram_mb"])
-
- logging.debug(f"Final scaling actions: CPU -> {cpu_action}, RAM -> {ram_action} | Confidence: {confidence}%")
- return cpu_action, ram_action, new_cores, new_ram
-
-
-
-
-def apply_scaling(lxc_id, new_cores, new_ram, config):
- max_retries = config.get("retry_logic", {}).get("max_retries", 3)
- retry_delay = config.get("retry_logic", {}).get("retry_delay", 2)
- base_url = config["api"]["api_url"]
- cores_endpoint = config["api"].get("cores_endpoint", "/scale/cores")
- ram_endpoint = config["api"].get("ram_endpoint", "/scale/ram")
-
- def perform_request(url, data, resource_type):
- resource_key = "cores" if resource_type == "CPU" else "memory"
- for attempt in range(max_retries):
- try:
- response = requests.post(url, json=data)
- response.raise_for_status()
- logging.info(f"Successfully scaled {resource_type} for LXC ID {lxc_id} to {data[resource_key]} {resource_type} units.")
- return True
- except requests.RequestException as e:
- if response.status_code == 500:
- logging.error(f"Server error (500) encountered on attempt {attempt + 1} to scale {resource_type} for LXC ID {lxc_id}. Aborting further attempts.")
- break # Skip further retries for 500 errors
- logging.error(f"Attempt {attempt + 1} failed to scale {resource_type} for LXC ID {lxc_id}: {e}")
- if attempt < max_retries - 1:
- logging.info(f"Retrying in {retry_delay} seconds...")
- time.sleep(retry_delay)
- else:
- logging.error(f"Scaling {resource_type} for LXC ID {lxc_id} failed after {max_retries} attempts.")
- return False
- return False
-
- if new_cores is not None:
- cpu_data = {"vm_id": lxc_id, "cores": new_cores}
- cpu_url = f"{base_url}{cores_endpoint}"
- if not perform_request(cpu_url, cpu_data, "CPU"):
- logging.error(f"Scaling operation aborted for LXC ID {lxc_id} due to CPU scaling failure.")
-
- if new_ram is not None:
- ram_data = {"vm_id": lxc_id, "memory": new_ram}
- ram_url = f"{base_url}{ram_endpoint}"
- if not perform_request(ram_url, ram_data, "RAM"):
- logging.error(f"Scaling operation aborted for LXC ID {lxc_id} due to RAM scaling failure.")
diff --git a/lxc_autoscale_ml/model/signal_handler.py b/lxc_autoscale_ml/model/signal_handler.py
deleted file mode 100644
index 95cf7bf..0000000
--- a/lxc_autoscale_ml/model/signal_handler.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import signal
-import logging
-import sys
-
-def setup_signal_handlers(cleanup_function=None):
- """
- Sets up signal handlers to handle termination signals (SIGINT and SIGTERM) gracefully.
-
- :param cleanup_function: A function to be called for cleanup before exiting (optional).
- """
- def signal_handler(signum, frame):
- """
- Handles received signals, logs them, and exits the program gracefully.
-
- :param signum: The signal number received.
- :param frame: The current stack frame (not used).
- """
- signal_name = signal.Signals(signum).name # Get a more descriptive signal name
- logging.info(f"Received {signal_name} ({signum}). Exiting gracefully.")
-
- # Call the cleanup function if provided
- if cleanup_function:
- logging.info("Performing cleanup before exiting...")
- try:
- cleanup_function()
- except Exception as e:
- logging.error(f"Error during cleanup: {e}")
-
- sys.exit(0)
-
- # Register signal handlers
- signal.signal(signal.SIGINT, signal_handler)
- signal.signal(signal.SIGTERM, signal_handler)
- logging.info("Signal handlers for SIGINT and SIGTERM are set up.")
diff --git a/lxc_autoscale_ml/monitor/README.md b/lxc_autoscale_ml/monitor/README.md
deleted file mode 100644
index ea3e167..0000000
--- a/lxc_autoscale_ml/monitor/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# LXC Monitor
-
-- [Documentation](https://github.com/fabriziosalmi/proxmox-lxc-autoscale/blob/main/docs/lxc_monitor/README.md)
diff --git a/lxc_autoscale_ml/monitor/lxc_monitor.py b/lxc_autoscale_ml/monitor/lxc_monitor.py
deleted file mode 100644
index 9065d4e..0000000
--- a/lxc_autoscale_ml/monitor/lxc_monitor.py
+++ /dev/null
@@ -1,327 +0,0 @@
-import asyncio
-import json
-import logging
-import os
-import yaml
-import psutil
-from logging.handlers import TimedRotatingFileHandler
-from subprocess import check_output, CalledProcessError
-from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Dict, List, Optional, Tuple
-import aiofiles
-
-# Load configuration from YAML file
-with open("/etc/lxc_autoscale/lxc_monitor.yaml", 'r') as config_file:
- config = yaml.safe_load(config_file)
-
-# Set up logging configuration
-LOG_FILE = config['logging']['log_file']
-LOG_MAX_BYTES = config['logging']['log_max_bytes']
-LOG_BACKUP_COUNT = config['logging']['log_backup_count']
-LOG_LEVEL = getattr(logging, config['logging']['log_level'].upper(), logging.INFO)
-
-# Configure structured logging with time-based rotation and retention
-logger = logging.getLogger("LXCMonitor")
-logger.setLevel(LOG_LEVEL)
-formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-
-# Console handler
-console_handler = logging.StreamHandler()
-console_handler.setFormatter(formatter)
-
-# Timed rotating file handler with log retention
-file_handler = TimedRotatingFileHandler(LOG_FILE, when="midnight", interval=1, backupCount=LOG_BACKUP_COUNT)
-file_handler.setFormatter(formatter)
-file_handler.suffix = "%Y-%m-%d"
-
-# Add handlers to the logger
-logger.addHandler(console_handler)
-logger.addHandler(file_handler)
-
-# Monitoring configuration
-EXPORT_FILE = config['monitoring']['export_file']
-CHECK_INTERVAL = config['monitoring']['check_interval']
-ENABLE_SWAP = config['monitoring']['enable_swap']
-ENABLE_NETWORK = config['monitoring']['enable_network']
-ENABLE_FILESYSTEM = config['monitoring']['enable_filesystem']
-PARALLEL_PROCESSING = config['monitoring']['parallel_processing']
-MAX_WORKERS = config['monitoring']['max_workers']
-EXCLUDED_DEVICES = config['monitoring']['excluded_devices']
-RETRY_LIMIT = config['monitoring'].get('retry_limit', 3) # Maximum retry attempts
-RETRY_DELAY = config['monitoring'].get('retry_delay', 2) # Delay between retries in seconds
-
-def get_running_lxc_containers() -> List[str]:
- """Retrieve a list of running LXC containers."""
- try:
- pct_output = check_output(['pct', 'list'], text=True).splitlines()
- return [line.split()[0] for line in pct_output if 'running' in line]
- except CalledProcessError as e:
- logger.error(f"Error retrieving LXC containers: {e}")
- return []
-
-def run_command(command: List[str]) -> Optional[str]:
- """Run a shell command in a thread pool."""
- try:
- return check_output(command, text=True)
- except CalledProcessError as e:
- logger.error(f"Command failed: {' '.join(command)}, error: {e}")
- return None
-
-async def retry_on_failure(func: Any, *args, **kwargs) -> Any:
- """Retry logic wrapper for transient failures."""
- for attempt in range(RETRY_LIMIT):
- try:
- return await func(*args, **kwargs)
- except Exception as e:
- logger.warning(f"Attempt {attempt + 1} failed for {func.__name__} with error: {e}")
- if attempt < RETRY_LIMIT - 1:
- await asyncio.sleep(RETRY_DELAY)
- else:
- logger.error(f"All {RETRY_LIMIT} attempts failed for {func.__name__}.")
- raise
-
-async def get_container_metric(command: List[str], executor: ThreadPoolExecutor) -> Optional[str]:
- """Helper function to execute commands asynchronously in containers."""
- return await asyncio.get_event_loop().run_in_executor(executor, run_command, command)
-
-async def parse_meminfo(container_id: str, executor: ThreadPoolExecutor) -> Dict[str, float]:
- """Retrieve memory and swap usage inside the container."""
- mem_info = {}
- for metric, key in [('MemTotal', 'memory_usage_mb'), ('MemAvailable', 'memory_free_mb'),
- ('SwapTotal', 'swap_total_mb'), ('SwapFree', 'swap_free_mb')]:
- command = ['pct', 'exec', container_id, '--', 'grep', metric, '/proc/meminfo']
- result = await get_container_metric(command, executor)
- if result:
- try:
- mem_info[key] = int(result.split()[1]) / 1024 # Convert to MB
- except ValueError:
- logger.warning(f"Unexpected memory info format for container {container_id}: {result}")
- # Calculate used memory and swap usage
- mem_info['memory_usage_mb'] = mem_info.get('memory_usage_mb', 0.0) - mem_info.get('memory_free_mb', 0.0)
- mem_info['swap_usage_mb'] = mem_info.get('swap_total_mb', 0.0) - mem_info.get('swap_free_mb', 0.0)
- return mem_info
-
-async def get_container_cpu_usage(container_id: str, executor: ThreadPoolExecutor) -> float:
- """Use pct exec to retrieve CPU usage inside the container."""
- command = ['pct', 'exec', container_id, '--', 'grep', 'cpu ', '/proc/stat']
- result = await get_container_metric(command, executor)
- if result:
- fields = result.split()
- if len(fields) < 5:
- logger.warning(f"Unexpected CPU stat format for container {container_id}: {fields}")
- return 0.0
- idle_time = int(fields[4])
- total_time = sum(int(field) for field in fields[1:])
- return 100 * (1 - (idle_time / total_time))
- return 0.0
-
-async def get_container_io_stats(container_id: str, executor: ThreadPoolExecutor) -> Dict[str, int]:
- """Retrieve I/O statistics inside the container."""
- command = ['pct', 'exec', container_id, '--', 'grep', '', '/proc/diskstats']
- result = await get_container_metric(command, executor)
- if result:
- io_stats_lines = result.splitlines()
- io_stats = {"reads": 0, "writes": 0}
- for line in io_stats_lines:
- fields = line.split()
- if len(fields) < 10:
- logger.warning(f"Unexpected disk stats format for container {container_id}: {fields}")
- continue
- device = fields[2]
- if any(device.startswith(exclude) for exclude in EXCLUDED_DEVICES):
- continue
- io_stats["reads"] += int(fields[5])
- io_stats["writes"] += int(fields[9])
- return io_stats
- return {}
-
-async def get_container_network_usage(container_id: str, executor: ThreadPoolExecutor) -> Dict[str, int]:
- """Retrieve network usage inside the container."""
- if not ENABLE_NETWORK:
- return {"rx_bytes": 0, "tx_bytes": 0}
-
- command = ['pct', 'exec', container_id, '--', 'cat', '/proc/net/dev']
- result = await get_container_metric(command, executor)
- if result:
- net_stats_lines = result.splitlines()[2:] # Skip headers
- rx_bytes, tx_bytes = 0, 0
- for line in net_stats_lines:
- fields = line.split()
- if len(fields) < 10:
- logger.warning(f"Unexpected network stats format for container {container_id}: {fields}")
- continue
- iface = fields[0].split(':')[0]
- if iface != 'lo': # Ignore loopback interface
- rx_bytes += int(fields[1])
- tx_bytes += int(fields[9])
- return {"rx_bytes": rx_bytes, "tx_bytes": tx_bytes}
- return {"rx_bytes": 0, "tx_bytes": 0}
-
-async def get_container_filesystem_usage(container_id: str, executor: ThreadPoolExecutor) -> Dict[str, float]:
- """Retrieve filesystem usage inside the container."""
- if not ENABLE_FILESYSTEM:
- return {"filesystem_usage_gb": 0, "filesystem_total_gb": 0, "filesystem_free_gb": 0}
-
- command = ['pct', 'exec', container_id, '--', 'df', '-m', '/']
- result = await get_container_metric(command, executor)
- if result:
- lines = result.splitlines()
- if len(lines) < 2:
- logger.warning(f"Unexpected filesystem stats format for container {container_id}: {lines}")
- return {"filesystem_usage_gb": 0, "filesystem_total_gb": 0, "filesystem_free_gb": 0}
-
- filesystem_stats = lines[1].split()
- if len(filesystem_stats) < 4:
- logger.warning(f"Incomplete filesystem stats for container {container_id}: {filesystem_stats}")
- return {"filesystem_usage_gb": 0, "filesystem_total_gb": 0, "filesystem_free_gb": 0}
-
- filesystem_total_gb = int(filesystem_stats[1]) / 1024 # Convert MB to GB
- filesystem_usage_gb = int(filesystem_stats[2]) / 1024 # Convert MB to GB
- filesystem_free_gb = int(filesystem_stats[3]) / 1024 # Convert MB to GB
-
- return {
- "filesystem_usage_gb": filesystem_usage_gb,
- "filesystem_total_gb": filesystem_total_gb,
- "filesystem_free_gb": filesystem_free_gb
- }
- return {"filesystem_usage_gb": 0, "filesystem_total_gb": 0, "filesystem_free_gb": 0}
-
-async def get_container_process_count(container_id: str, executor: ThreadPoolExecutor) -> int:
- """Retrieve the number of processes running inside the container."""
- command = ['pct', 'exec', container_id, '--', 'ps', '-e']
- result = await get_container_metric(command, executor)
- if result:
- lines = result.splitlines()
- if lines and any(header in lines[0] for header in ["PID", "TTY", "TIME", "CMD"]):
- lines = lines[1:] # Remove header line
- return len(lines)
- return 0
-
-async def collect_metrics_for_container(container_id: str, executor: ThreadPoolExecutor) -> Tuple[str, Dict[str, Any]]:
- """Collect all metrics for a given container."""
- logger.info(f"Collecting metrics for container: {container_id}")
-
- # Use retry logic for each metric collection
- cpu_usage = await retry_on_failure(get_container_cpu_usage, container_id, executor)
- memory_swap_usage = await retry_on_failure(parse_meminfo, container_id, executor)
- io_stats = await retry_on_failure(get_container_io_stats, container_id, executor)
- network_usage = await retry_on_failure(get_container_network_usage, container_id, executor)
- filesystem_usage = await retry_on_failure(get_container_filesystem_usage, container_id, executor)
- process_count = await retry_on_failure(get_container_process_count, container_id, executor)
-
- container_metrics = {
- "timestamp": datetime.now().isoformat(),
- "cpu_usage_percent": cpu_usage,
- "memory_usage_mb": memory_swap_usage.get("memory_usage_mb", 0.0),
- "swap_usage_mb": memory_swap_usage.get("swap_usage_mb", 0.0),
- "swap_total_mb": memory_swap_usage.get("swap_total_mb", 0.0),
- "process_count": process_count,
- "io_stats": io_stats,
- "network_usage": network_usage,
- "filesystem_usage_gb": filesystem_usage["filesystem_usage_gb"],
- "filesystem_total_gb": filesystem_usage["filesystem_total_gb"],
- "filesystem_free_gb": filesystem_usage["filesystem_free_gb"]
- }
-
- logger.info(f"Metrics for {container_id}: {container_metrics}")
-
- return container_id, container_metrics
-
-async def collect_and_export_metrics():
- """Collect and export metrics for all running LXC containers."""
- start_time = datetime.now()
- metrics = {}
- containers = get_running_lxc_containers()
-
- if not containers:
- logger.info("No running LXC containers found.")
- return
-
- logger.debug(f"Found {len(containers)} running containers.")
-
- async with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
- if PARALLEL_PROCESSING:
- tasks = [collect_metrics_for_container(container_id, executor) for container_id in containers]
- results = await asyncio.gather(*tasks)
- else:
- results = []
- for container_id in containers:
- result = await collect_metrics_for_container(container_id, executor)
- results.append(result)
-
- # Ensure results are processed correctly
- for container_id, container_metrics in results:
- metrics[container_id] = container_metrics
-
- end_time = datetime.now()
- total_duration = (end_time - start_time).total_seconds()
-
- # Add summary to the JSON output
- summary = {
- "collection_start_time": start_time.isoformat(),
- "collection_end_time": end_time.isoformat(),
- "total_containers": len(containers),
- "total_duration_seconds": total_duration,
- "monitor_cpu_percent": psutil.cpu_percent(),
- "monitor_memory_usage_mb": psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
- }
- metrics["summary"] = summary
-
- # Load existing data if the file exists
- existing_data = await load_existing_data(EXPORT_FILE)
-
- # Append the new metrics
- existing_data.append(metrics)
-
- logger.debug(f"Appending new metrics: {metrics}")
-
- # Write the updated data to the file
- await write_metrics_to_file(EXPORT_FILE, existing_data)
-
-async def load_existing_data(file_path: str) -> List[Dict[str, Any]]:
- """Load existing data from the JSON file."""
- if not os.path.exists(file_path):
- return []
-
- try:
- async with aiofiles.open(file_path, mode='r') as json_file:
- content = await json_file.read()
- data = json.loads(content)
- if not isinstance(data, list):
- logger.error(f"Data in {file_path} is not a list. Resetting to an empty list.")
- return []
- logger.debug(f"Loaded existing metrics from {file_path}.")
- return data
- except (IOError, json.JSONDecodeError) as e:
- logger.warning(f"Failed to read existing data from {file_path}: {e}")
- return []
-
-async def write_metrics_to_file(file_path: str, data: List[Dict[str, Any]]):
- """Write metrics data to a JSON file asynchronously."""
- temp_file = f"{file_path}.tmp"
- try:
- async with aiofiles.open(temp_file, mode='w') as json_file:
- await json_file.write(json.dumps(data, indent=4))
- os.replace(temp_file, file_path)
- logger.info(f"Metrics successfully exported to {file_path}")
- except IOError as e:
- logger.error(f"Failed to write metrics to {file_path}: {e}")
- if os.path.exists(temp_file):
- os.remove(temp_file)
-
-async def monitor_and_export():
- """Continuously monitor and export metrics at the defined intervals."""
- try:
- while True:
- logger.info("Starting new metrics collection cycle.")
- await collect_and_export_metrics()
- logger.info(f"Waiting for {CHECK_INTERVAL} seconds before the next cycle.")
- await asyncio.sleep(CHECK_INTERVAL)
- except KeyboardInterrupt:
- logger.info("Shutting down metrics collector due to KeyboardInterrupt.")
- except Exception as e:
- logger.error(f"Unexpected error: {e}")
-
-if __name__ == "__main__":
- asyncio.run(monitor_and_export())
diff --git a/lxc_autoscale_ml/monitor/lxc_monitor.service b/lxc_autoscale_ml/monitor/lxc_monitor.service
deleted file mode 100644
index e28d983..0000000
--- a/lxc_autoscale_ml/monitor/lxc_monitor.service
+++ /dev/null
@@ -1,19 +0,0 @@
-[Unit]
-Description=LXC Monitor Service
-Documentation=https://github.com/fabriziosalmi/proxmox-lxc-autoscale
-After=network.target
-
-[Service]
-ExecStart=/usr/bin/python3 /usr/local/bin/lxc_monitor.py
-WorkingDirectory=/usr/local/bin/
-StandardOutput=inherit
-StandardError=inherit
-Restart=on-failure
-User=root
-
-# Logging configuration
-Environment="PYTHONUNBUFFERED=1"
-EnvironmentFile=/etc/lxc_autoscale_ml/lxc_monitor.yaml
-
-[Install]
-WantedBy=multi-user.target
diff --git a/lxc_autoscale_ml/monitor/lxc_monitor.yaml b/lxc_autoscale_ml/monitor/lxc_monitor.yaml
deleted file mode 100644
index ca1dcc9..0000000
--- a/lxc_autoscale_ml/monitor/lxc_monitor.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-logging:
- # Path to the file where log messages are written.
- log_file: "/var/log/lxc_monitor.log"
-
- # Maximum size of the log file before it is rotated (in bytes). Set to 5 MB here.
- log_max_bytes: 5242880 # 5 MB
-
- # Number of backup log files to keep. Older files are deleted as new ones are created.
- log_backup_count: 7
-
- # Logging level to determine the verbosity of log messages. Options include "DEBUG", "INFO", "WARNING", "ERROR", and "CRITICAL".
- log_level: "INFO"
-
-monitoring:
- # Path to the file where metrics data is exported in JSON format.
- export_file: "/var/log/lxc_metrics.json"
-
- # Interval (in seconds) at which the monitoring checks are performed.
- check_interval: 60 # seconds
-
- # Flag to enable or disable swap memory monitoring.
- enable_swap: true
-
- # Flag to enable or disable network statistics monitoring.
- enable_network: true
-
- # Flag to enable or disable filesystem statistics monitoring.
- enable_filesystem: true
-
- # Flag to enable or disable parallel processing of monitoring tasks.
- parallel_processing: true # Toggle for parallel processing
-
- # Maximum number of parallel workers to use if parallel_processing is enabled.
- max_workers: 8 # Max number of parallel workers if parallel_processing is enabled
-
- # List of device types to exclude from I/O statistics collection.
- excluded_devices: ['loop', 'dm-'] # Devices to exclude in I/O stats
diff --git a/lxc_autoscale_ml/tools/generate_metrics.py b/lxc_autoscale_ml/tools/generate_metrics.py
deleted file mode 100644
index 8c9ed83..0000000
--- a/lxc_autoscale_ml/tools/generate_metrics.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# This Python script generates fake metrics data for containers.
-# It can simulate various metrics such as CPU usage, memory usage, network activity, etc., with options to introduce randomness and spikes in the data.
-# The script supports generating data for multiple containers over a specified period and saving the output in JSON format.
-# Example use:
-# python3 generate_metrics.py --num-containers 5 --num-entries 100000 --interval-seconds 60 --randomness 1.0 --spike-likelihood 0.2 --spike-magnitude 1.0 --output-file fake_metrics.json
-
-
-import json
-import argparse
-import datetime
-import random
-
-def generate_metrics(container_id, base_time, interval_seconds, randomness_factor, spike_likelihood, spike_magnitude, base_metrics):
- """
- Generate a single set of metrics for a container, with optional spikes.
-
- Args:
- container_id (str): The ID of the container.
- base_time (datetime): The base timestamp to start from.
- interval_seconds (int): Time interval between each data point.
- randomness_factor (float): Factor to introduce variability in metrics.
- spike_likelihood (float): Probability of a spike occurring (0 to 1).
- spike_magnitude (float): Magnitude of the spike when it occurs.
- base_metrics (dict): Base metrics with initial values and variability ranges.
-
- Returns:
- dict: A dictionary containing the generated metrics.
- datetime: The updated base_time after the interval.
- """
- # Base metrics with added randomness and optional spikes
- cpu_usage_percent = base_metrics["cpu_usage_percent"] + (randomness_factor * base_metrics["cpu_variability"])
- memory_usage_mb = base_metrics["memory_usage_mb"] + (randomness_factor * base_metrics["memory_variability"])
- swap_usage_mb = base_metrics["swap_usage_mb"]
- swap_total_mb = base_metrics["swap_total_mb"]
- process_count = base_metrics["process_count"]
- io_reads = base_metrics["io_reads"] + int(randomness_factor * base_metrics["io_variability"])
- io_writes = base_metrics["io_writes"] + int(randomness_factor * base_metrics["io_variability"])
- network_rx_bytes = base_metrics["network_rx_bytes"] + int(randomness_factor * base_metrics["network_variability"])
- network_tx_bytes = base_metrics["network_tx_bytes"] + int(randomness_factor * base_metrics["network_variability"])
- filesystem_usage_gb = base_metrics["filesystem_usage_gb"]
- filesystem_total_gb = base_metrics["filesystem_total_gb"]
- filesystem_free_gb = base_metrics["filesystem_free_gb"]
-
- # Apply spikes based on the spike likelihood
- if random.random() < spike_likelihood:
- cpu_usage_percent *= (1 + spike_magnitude)
- memory_usage_mb *= (1 + spike_magnitude)
- io_reads *= (1 + spike_magnitude)
- io_writes *= (1 + spike_magnitude)
- network_rx_bytes *= (1 + spike_magnitude)
- network_tx_bytes *= (1 + spike_magnitude)
-
- timestamp = base_time + datetime.timedelta(seconds=interval_seconds)
- base_time = timestamp
-
- metrics = {
- container_id: {
- "timestamp": timestamp.isoformat(),
- "cpu_usage_percent": cpu_usage_percent,
- "memory_usage_mb": memory_usage_mb,
- "swap_usage_mb": swap_usage_mb,
- "swap_total_mb": swap_total_mb,
- "process_count": process_count,
- "io_stats": {
- "reads": io_reads,
- "writes": io_writes
- },
- "network_usage": {
- "rx_bytes": network_rx_bytes,
- "tx_bytes": network_tx_bytes
- },
- "filesystem_usage_gb": filesystem_usage_gb,
- "filesystem_total_gb": filesystem_total_gb,
- "filesystem_free_gb": filesystem_free_gb
- },
- "summary": {
- "collection_start_time": (timestamp - datetime.timedelta(seconds=6)).isoformat(),
- "collection_end_time": timestamp.isoformat(),
- "total_containers": 1,
- "total_duration_seconds": 6.0,
- "monitor_cpu_percent": cpu_usage_percent,
- "monitor_memory_usage_mb": memory_usage_mb
- }
- }
-
- return metrics, base_time
-
-def generate_dataset(container_ids, num_entries, interval_seconds, randomness_factor, spike_likelihood, spike_magnitude, base_metrics):
- """
- Generate a dataset of fake metrics for multiple containers, with optional spikes.
-
- Args:
- container_ids (list of str): List of container IDs.
- num_entries (int): Number of metric entries to generate for each container.
- interval_seconds (int): Time interval between each data point.
- randomness_factor (float): Factor to introduce variability in metrics.
- spike_likelihood (float): Probability of a spike occurring (0 to 1).
- spike_magnitude (float): Magnitude of the spike when it occurs.
- base_metrics (dict): Base metrics with initial values and variability ranges.
-
- Returns:
- list of dict: The generated dataset.
- """
- base_time = datetime.datetime.now()
- dataset = []
-
- for _ in range(num_entries):
- for container_id in container_ids:
- metrics, base_time = generate_metrics(
- container_id, base_time, interval_seconds, randomness_factor, spike_likelihood, spike_magnitude, base_metrics
- )
- dataset.append(metrics)
-
- return dataset
-
-def main():
- """
- Main function to parse arguments and generate the fake metrics dataset.
- """
- parser = argparse.ArgumentParser(description="Generate fake metrics data with optional spikes.")
- parser.add_argument(
- '--container-id', nargs='*', type=str,
- help="Container ID(s). If not provided, IDs from 100 to 999 will be used."
- )
- parser.add_argument(
- '--num-containers', type=int, default=1,
- help="Number of container IDs to generate (only used if --container-id is not provided)"
- )
- parser.add_argument(
- '--num-entries', type=int, default=10,
- help="Number of metric entries to generate for each container"
- )
- parser.add_argument(
- '--interval-seconds', type=int, default=60,
- help="Interval between data points in seconds"
- )
- parser.add_argument(
- '--randomness', type=float, default=1.0,
- help="Randomness factor for metric generation"
- )
- parser.add_argument(
- '--spike-likelihood', type=float, default=0.1,
- help="Likelihood of a spike occurring (0 to 1)"
- )
- parser.add_argument(
- '--spike-magnitude', type=float, default=0.5,
- help="Magnitude of the spike (as a percentage increase)"
- )
- parser.add_argument(
- '--output-file', type=str, default="fake_metrics.json",
- help="Output file to save the generated dataset"
- )
-
- # Default base metrics with variability ranges
- base_metrics = {
- "cpu_usage_percent": 5.0,
- "cpu_variability": 0.5,
- "memory_usage_mb": 96.0,
- "memory_variability": 0.8,
- "swap_usage_mb": 94.3359375,
- "swap_total_mb": 512.0,
- "process_count": 27,
- "io_reads": 19500000,
- "io_writes": 6900000,
- "io_variability": 50000,
- "network_rx_bytes": 950000,
- "network_tx_bytes": 57000,
- "network_variability": 10000,
- "filesystem_usage_gb": 1.5947265625,
- "filesystem_total_gb": 18.5341796875,
- "filesystem_free_gb": 16.0048828125
- }
-
- args = parser.parse_args()
-
- if args.container_id:
- container_ids = args.container_id
- else:
- start_id = 100
- end_id = start_id + args.num_containers
- end_id = min(end_id, 1000)
- container_ids = [str(i) for i in range(start_id, end_id)]
-
- dataset = generate_dataset(
- container_ids, args.num_entries, args.interval_seconds, args.randomness,
- args.spike_likelihood, args.spike_magnitude, base_metrics
- )
-
- with open(args.output_file, 'w', encoding='utf-8') as output_file:
- json.dump(dataset, output_file, indent=4)
-
- print(f"Generated {args.num_entries} entries of fake metrics data for "
- f"{len(container_ids)} containers with {args.interval_seconds}s intervals "
- f"and saved to {args.output_file}")
-
-if __name__ == "__main__":
- main()
diff --git a/lxc_autoscale_ml/uninstall.sh b/lxc_autoscale_ml/uninstall.sh
deleted file mode 100644
index 9233592..0000000
--- a/lxc_autoscale_ml/uninstall.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-
-# Log file for uninstallation
-LOGFILE="lxc_autoscale_uninstaller.log"
-
-# Define text styles and emojis
-BOLD=$(tput bold)
-RESET=$(tput sgr0)
-GREEN=$(tput setaf 2)
-RED=$(tput setaf 1)
-CHECKMARK="\xE2\x9C\x85" # โ๏ธ
-CROSSMARK="\xE2\x9D\x8C" # โ
-
-# Log function
-log() {
- local level="$1"
- local message="$2"
- local timestamp
- timestamp=$(date +"%Y-%m-%d %H:%M:%S")
- case $level in
- "INFO")
- echo -e "${timestamp} [${GREEN}${level}${RESET}] ${message}" | tee -a "$LOGFILE"
- ;;
- "ERROR")
- echo -e "${timestamp} [${RED}${level}${RESET}] ${message}" | tee -a "$LOGFILE"
- ;;
- esac
-}
-
-log "INFO" "Starting LXC AutoScale uninstallation..."
-
-# Function to stop and disable a service
-uninstall_service() {
- local service_name="$1"
- log "INFO" "Stopping and disabling the $service_name service..."
- if systemctl stop "$service_name" && systemctl disable "$service_name"; then
- log "INFO" "${CHECKMARK} Successfully stopped and disabled $service_name."
- else
- log "ERROR" "${CROSSMARK} Failed to stop or disable $service_name, or it was not found."
- fi
-}
-
-# Function to remove files and directories
-remove_files() {
- local files=("$@")
- for file in "${files[@]}"; do
- log "INFO" "Removing $file..."
- if rm -rf "$file"; then
- log "INFO" "${CHECKMARK} Successfully removed $file."
- else
- log "ERROR" "${CROSSMARK} Failed to remove $file, or it was not found."
- fi
- done
-}
-
-# Uninstall LXC AutoScale API
-log "INFO" "Uninstalling LXC AutoScale API..."
-uninstall_service "lxc_autoscale_api.service"
-remove_files "/usr/local/bin/lxc_autoscale_api" "/etc/systemd/system/lxc_autoscale_api.service" "/etc/lxc_autoscale/lxc_autoscale_api.yaml"
-
-# Uninstall LXC AutoScale ML
-log "INFO" "Uninstalling LXC AutoScale ML..."
-uninstall_service "lxc_autoscale_ml.service"
-remove_files "/usr/local/bin/lxc_autoscale_ml.py" "/etc/systemd/system/lxc_autoscale_ml.service" "/etc/lxc_autoscale/lxc_autoscale_ml.yaml"
-
-# Uninstall LXC Monitor
-log "INFO" "Uninstalling LXC Monitor..."
-uninstall_service "lxc_monitor.service"
-remove_files "/usr/local/bin/lxc_monitor.py" "/etc/systemd/system/lxc_monitor.service" "/etc/lxc_autoscale/lxc_monitor.yaml"
-
-# Cleanup shared configuration directory
-log "INFO" "Cleaning up shared configuration directory if empty..."
-if [ -d "/etc/lxc_autoscale" ] && [ ! "$(ls -A /etc/lxc_autoscale)" ]; then
- rmdir /etc/lxc_autoscale
- log "INFO" "${CHECKMARK} Successfully removed empty directory /etc/lxc_autoscale."
-fi
-
-# Final cleanup
-log "INFO" "Reloading systemd daemon to reflect changes..."
-systemctl daemon-reload
-
-log "INFO" "LXC AutoScale uninstallation complete!"