#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Experiment state monitoring service.

While an experiment is in the waiting state, this module periodically queries
InfluxDB and detects experiment state transitions (start / end).
"""
import threading
import time
import sqlite3
import datetime
from pathlib import Path
from typing import Optional, Callable, Dict, Any

from logger import get_logger
from influx_service import InfluxService, InfluxConnectionParams
import pandas as pd

logger = get_logger()


class ExperimentStateMonitor:
    """Experiment state monitor — polls InfluxDB and detects state changes.

    A daemon thread queries the configured status field at a fixed interval.
    When the value transitions to the configured start/end values, the
    start/end timestamps are written to the local SQLite database
    (``experiments.db`` next to this file) and the optional callbacks are
    invoked.  Connection health is tracked with a consecutive-failure counter
    guarded by ``_connection_lock``.
    """

    def __init__(
        self,
        experiment_id: int,
        work_order_no: str,
        start_time: datetime.datetime,
        influx_params: InfluxConnectionParams,
        query_config: Dict[str, Any],
        on_state_changed: Optional[Callable[[str, str], None]] = None,
        on_connection_changed: Optional[Callable[[bool, str], None]] = None,
        poll_interval: int = 5,
        max_retry_attempts: int = 3,
        retry_backoff_base: float = 2.0,
        max_consecutive_failures: int = 10
    ):
        """Initialize the monitor.

        Args:
            experiment_id: Experiment record ID.
            work_order_no: Work order number.
            start_time: Work-order creation time (data after this time is of
                interest).
            influx_params: InfluxDB connection parameters.
            query_config: Query configuration containing:
                - bucket: bucket name
                - measurement: measurement name
                - fields: list of field names
                - filters: filter conditions
                - status_field: field name used to detect state changes
                - status_values: {'start': <start value>, 'end': <end value>}
            on_state_changed: Callback invoked as (old_state, new_state).
            on_connection_changed: Callback invoked as (is_connected, message).
            poll_interval: Polling interval in seconds.
            max_retry_attempts: Maximum retries for a single query.
            retry_backoff_base: Base for exponential retry backoff.
            max_consecutive_failures: Consecutive failures before the
                connection is reported as lost.
        """
        self.experiment_id = experiment_id
        self.work_order_no = work_order_no
        self.start_time = start_time
        self.influx_params = influx_params
        self.query_config = query_config
        self.on_state_changed = on_state_changed
        self.on_connection_changed = on_connection_changed
        self.poll_interval = poll_interval
        self.max_retry_attempts = max_retry_attempts
        self.retry_backoff_base = retry_backoff_base
        self.max_consecutive_failures = max_consecutive_failures

        # Worker thread and shutdown signal.
        self._thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()

        # State tracking: last observed status value and lifecycle flags.
        self._last_state: Optional[str] = None
        self._experiment_started = False
        self._experiment_ended = False
        self._start_time_recorded: Optional[str] = None
        self._end_time_recorded: Optional[str] = None

        # Connection-health tracking (guarded by _connection_lock).
        self._is_connected = False
        self._consecutive_failures = 0
        self._last_success_time: Optional[datetime.datetime] = None
        self._connection_lock = threading.Lock()

        logger.info(
            f"[监控器初始化] 实验ID={experiment_id}, 工单号={work_order_no}, "
            f"开始时间={start_time}, 轮询间隔={poll_interval}秒, "
            f"最大重试次数={max_retry_attempts}, 最大连续失败次数={max_consecutive_failures}"
        )

    def start(self) -> None:
        """Start the monitoring thread (no-op if already running)."""
        if self._thread is not None and self._thread.is_alive():
            logger.warning(f"[监控器] 实验{self.experiment_id}的监控线程已在运行")
            return
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()
        logger.info(f"[监控器] 实验{self.experiment_id}的监控线程已启动")

    def stop(self) -> None:
        """Stop the monitoring thread, waiting up to 5 seconds for it to exit."""
        if self._thread is None:
            return
        self._stop_event.set()
        # Guard against a thread attempting to join itself (e.g. stop()
        # called from a callback running on the monitor thread).
        current_thread = threading.current_thread()
        if self._thread.is_alive() and self._thread != current_thread:
            self._thread.join(timeout=5)
        logger.info(f"[监控器] 实验{self.experiment_id}的监控线程已停止")

    def _monitor_loop(self) -> None:
        """Monitoring loop — periodically query and detect state changes."""
        try:
            logger.info(f"[监控循环] 开始监控实验{self.experiment_id}")
            while not self._stop_event.is_set():
                try:
                    # Query the current status value.
                    logger.debug(f"[监控循环] 实验{self.experiment_id}开始查询状态...")
                    current_state = self._query_current_state()
                    logger.info(
                        f"[监控循环] 实验{self.experiment_id}查询结果: 当前状态={current_state}, "
                        f"上一状态={self._last_state}"
                    )
                    if current_state is not None:
                        if self._last_state is None:
                            # First successful query: record the initial state.
                            self._last_state = current_state
                            logger.info(
                                f"[监控循环] 实验{self.experiment_id}初始状态: {current_state}"
                            )
                        elif current_state != self._last_state:
                            # State transition detected.
                            old_state = self._last_state
                            self._last_state = current_state
                            logger.info(
                                f"[监控循环] 🔄 实验{self.experiment_id}状态变化: {old_state} -> {current_state}"
                            )
                            self._handle_state_change(old_state, current_state)
                        else:
                            logger.debug(
                                f"[监控循环] 实验{self.experiment_id}状态无变化: {current_state}"
                            )
                    else:
                        logger.debug(f"[监控循环] 实验{self.experiment_id}暂无数据")
                    # Sleep until the next polling cycle (wakes early on stop).
                    self._stop_event.wait(self.poll_interval)
                except Exception as e:
                    logger.error(f"[监控循环] 实验{self.experiment_id}查询错误: {e}", exc_info=True)
                    self._stop_event.wait(self.poll_interval)
        except Exception as e:
            logger.error(f"[监控循环] 实验{self.experiment_id}监控线程异常: {e}", exc_info=True)
        finally:
            logger.info(f"[监控循环] 实验{self.experiment_id}监控线程已退出")

    def _query_current_state(self) -> Optional[str]:
        """Query the current status value, with retry.

        Returns:
            The current status value, or None when the configuration is
            incomplete, no data is available, or all retries failed.
        """
        # Validate the query configuration before hitting the database.
        bucket = self.query_config.get('bucket', '')
        measurement = self.query_config.get('measurement', '')
        fields = self.query_config.get('fields', [])
        filters = self.query_config.get('filters', {})
        status_field = self.query_config.get('status_field', '')
        if not all([bucket, measurement, fields, status_field]):
            logger.warning(
                f"[查询] 实验{self.experiment_id}查询配置不完整: "
                f"bucket={bucket}, measurement={measurement}, fields={fields}, status_field={status_field}"
            )
            return None

        # Query with exponential-backoff retry.
        for attempt in range(self.max_retry_attempts):
            try:
                result = self._execute_query(bucket, measurement, fields, filters, status_field)
                # Success: reset the failure counter / connection state.
                self._on_query_success()
                return result
            except Exception as e:
                is_last_attempt = (attempt == self.max_retry_attempts - 1)
                if is_last_attempt:
                    logger.error(
                        f"[查询] 实验{self.experiment_id}查询失败(已重试{attempt + 1}次): {e}",
                        exc_info=True
                    )
                    self._on_query_failure(str(e))
                else:
                    backoff_time = self.retry_backoff_base ** attempt
                    logger.warning(
                        f"[查询] 实验{self.experiment_id}查询失败(第{attempt + 1}次尝试),"
                        f"{backoff_time:.1f}秒后重试: {e}"
                    )
                    # Wait on the stop event instead of time.sleep so a
                    # shutdown request aborts the backoff immediately.
                    if self._stop_event.wait(backoff_time):
                        return None
        return None

    def _execute_query(self, bucket: str, measurement: str, fields: list,
                       filters: dict, status_field: str) -> Optional[str]:
        """Execute a single query (no retry logic).

        Returns:
            The latest value of *status_field*, or None when the result set
            is empty or the field is missing.

        Raises:
            Exception: Any error raised while querying InfluxDB.
        """
        service = InfluxService(self.influx_params)
        # Look at the most recent hour of data only.
        time_range = "-1h"
        logger.debug(
            f"[查询] 实验{self.experiment_id}查询InfluxDB: "
            f"bucket={bucket}, measurement={measurement}, fields={fields}"
        )
        df = service.query(
            bucket=bucket,
            measurement=measurement,
            fields=fields,
            filters=filters,
            time_range=time_range
        )
        if df.empty:
            logger.debug(f"[查询] 实验{self.experiment_id}查询结果为空")
            return None

        # Flux results carry values in '_value' keyed by '_field'.
        if '_value' in df.columns and '_field' in df.columns:
            status_data = df[df['_field'] == status_field]
            if not status_data.empty:
                # Sort by time and take the most recent record.
                if '_time' in status_data.columns:
                    status_data = status_data.sort_values('_time')
                latest_value = status_data['_value'].iloc[-1]
                logger.debug(
                    f"[查询] 实验{self.experiment_id}最新状态值: {latest_value} "
                    f"(字段: {status_field})"
                )
                return str(latest_value)
            logger.warning(
                f"[查询] 实验{self.experiment_id}未找到字段'{status_field}'的数据"
            )
            return None
        logger.warning(
            f"[查询] 实验{self.experiment_id}查询结果格式异常, "
            f"可用字段: {list(df.columns)}"
        )
        return None

    def _on_query_success(self) -> None:
        """Handle a successful query: reset failure state, report recovery."""
        with self._connection_lock:
            was_disconnected = not self._is_connected
            self._consecutive_failures = 0
            self._last_success_time = datetime.datetime.now()
            if was_disconnected:
                self._is_connected = True
                message = f"实验{self.experiment_id}已恢复与InfluxDB的连接"
                logger.info(f"[连接恢复] ✅ {message}")
                # Notify listeners about the recovered connection.
                if self.on_connection_changed:
                    try:
                        self.on_connection_changed(True, message)
                    except Exception as e:
                        logger.error(f"[连接回调] 执行连接状态回调失败: {e}", exc_info=True)

    def _on_query_failure(self, error_message: str) -> None:
        """Handle a failed query: count failures, report disconnection.

        Args:
            error_message: Description of the failure.
        """
        with self._connection_lock:
            self._consecutive_failures += 1
            if self._consecutive_failures >= self.max_consecutive_failures:
                was_connected = self._is_connected
                if was_connected:
                    # Threshold crossed for the first time: mark disconnected.
                    self._is_connected = False
                    message = (
                        f"实验{self.experiment_id}与InfluxDB连接断开 "
                        f"(连续失败{self._consecutive_failures}次): {error_message}"
                    )
                    logger.error(f"[连接断开] ❌ {message}")
                    if self.on_connection_changed:
                        try:
                            self.on_connection_changed(False, message)
                        except Exception as e:
                            logger.error(f"[连接回调] 执行连接状态回调失败: {e}", exc_info=True)
                else:
                    logger.warning(
                        f"[连接状态] 实验{self.experiment_id}仍处于断开状态 "
                        f"(连续失败{self._consecutive_failures}次)"
                    )
            else:
                logger.warning(
                    f"[连接状态] 实验{self.experiment_id}查询失败 "
                    f"(连续失败{self._consecutive_failures}/{self.max_consecutive_failures}次)"
                )

    def _handle_state_change(self, old_state: str, new_state: str) -> None:
        """Handle a detected state transition.

        Args:
            old_state: Previous status value.
            new_state: New status value.
        """
        try:
            status_values = self.query_config.get('status_values', {})
            start_value = status_values.get('start', '')
            end_value = status_values.get('end', '')
            logger.info(
                f"[状态变化] 实验{self.experiment_id}: {old_state} -> {new_state}, "
                f"开始值={start_value}, 结束值={end_value}"
            )
            # Experiment start: value becomes the configured start value.
            if not self._experiment_started and str(new_state) == str(start_value):
                self._on_experiment_started()
            # Experiment end: already started and value becomes the end value.
            elif self._experiment_started and str(new_state) == str(end_value):
                self._on_experiment_ended()
            # Always notify the state-change listener.
            if self.on_state_changed:
                self.on_state_changed(old_state, new_state)
        except Exception as e:
            logger.error(f"[状态变化] 实验{self.experiment_id}处理失败: {e}", exc_info=True)

    def _on_experiment_started(self) -> None:
        """Record the experiment start time and persist it."""
        try:
            self._experiment_started = True
            self._start_time_recorded = datetime.datetime.now().isoformat(timespec='seconds')
            logger.info(
                f"[实验开始] 🟢 实验{self.experiment_id}已开始, "
                f"记录时间: {self._start_time_recorded}"
            )
            # Persist start_ts to the database.
            self._update_experiment_start_time(self._start_time_recorded)
            logger.info(f"[实验开始] ✅ 实验{self.experiment_id}开始时间已记录并更新数据库")
        except Exception as e:
            logger.error(f"[实验开始] 实验{self.experiment_id}处理失败: {e}", exc_info=True)

    def _on_experiment_ended(self) -> None:
        """Record the experiment end time, persist it, and stop monitoring."""
        try:
            # A paused experiment must not be closed by a state transition.
            if self._is_experiment_paused():
                logger.info(
                    f"[实验结束] ⏸️ 实验{self.experiment_id}处于暂停状态,"
                    f"忽略状态变化,不记录结束时间"
                )
                return
            self._experiment_ended = True
            self._end_time_recorded = datetime.datetime.now().isoformat(timespec='seconds')
            logger.info(
                f"[实验结束] 🔴 实验{self.experiment_id}已结束, "
                f"记录时间: {self._end_time_recorded}"
            )
            # Persist end_ts to the database (also triggers script execution).
            self._update_experiment_end_time(self._end_time_recorded)
            logger.info(f"[实验结束] ✅ 实验{self.experiment_id}结束时间已记录并更新数据库")
            # Set the stop flag so the monitor loop exits on its own; calling
            # stop() here would make the thread try to join itself.
            self._stop_event.set()
            logger.info(f"[实验结束] 🛑 已设置停止标志,监控将自动退出")
        except Exception as e:
            logger.error(f"[实验结束] 实验{self.experiment_id}处理失败: {e}", exc_info=True)

    def _open_db(self) -> sqlite3.Connection:
        """Open the local experiments SQLite database (next to this file)."""
        db_path = Path(__file__).parent / "experiments.db"
        return sqlite3.connect(str(db_path))

    def _is_experiment_paused(self) -> bool:
        """Return True if the experiment is currently marked as paused."""
        try:
            db = self._open_db()
            try:
                cur = db.cursor()
                cur.execute(
                    "SELECT is_paused FROM experiments WHERE id=?",
                    (self.experiment_id,)
                )
                result = cur.fetchone()
            finally:
                # Always release the connection, even when the query raises.
                db.close()
            if result:
                is_paused = result[0] == 1
                logger.debug(f"[暂停检查] 实验{self.experiment_id}暂停状态: {is_paused}")
                return is_paused
            logger.warning(f"[暂停检查] 实验{self.experiment_id}未找到记录")
            return False
        except Exception as e:
            logger.error(
                f"[暂停检查] 实验{self.experiment_id}查询失败: {e}",
                exc_info=True
            )
            return False

    def _update_experiment_start_time(self, start_time: str) -> None:
        """Persist the experiment start time to the database.

        Args:
            start_time: ISO-formatted start timestamp.
        """
        try:
            db = self._open_db()
            try:
                cur = db.cursor()
                cur.execute(
                    "UPDATE experiments SET start_ts=? WHERE id=?",
                    (start_time, self.experiment_id)
                )
                db.commit()
            finally:
                db.close()
            logger.info(
                f"[数据库更新] 实验{self.experiment_id}开始时间已更新: {start_time}"
            )
        except Exception as e:
            logger.error(
                f"[数据库更新] 实验{self.experiment_id}更新开始时间失败: {e}",
                exc_info=True
            )

    def _update_experiment_end_time(self, end_time: str) -> None:
        """Persist the experiment end time, then run the dynamic script.

        Args:
            end_time: ISO-formatted end timestamp.
        """
        try:
            db = self._open_db()
            try:
                cur = db.cursor()
                cur.execute(
                    "UPDATE experiments SET end_ts=? WHERE id=?",
                    (end_time, self.experiment_id)
                )
                db.commit()
            finally:
                db.close()
            logger.info(
                f"[数据库更新] 实验{self.experiment_id}结束时间已更新: {end_time}"
            )
            # After the experiment ends, run the dynamic script and save its
            # output.
            self._execute_and_save_script_data()
        except Exception as e:
            logger.error(
                f"[数据库更新] 实验{self.experiment_id}更新结束时间失败: {e}",
                exc_info=True
            )

    def _execute_and_save_script_data(self) -> None:
        """Run the dynamic script and save its output to the database."""
        try:
            import json
            logger.info(f"[脚本执行] 开始为实验{self.experiment_id}执行动态脚本")

            # Load the experiment configuration snapshot.
            db = self._open_db()
            try:
                cur = db.cursor()
                cur.execute(
                    "SELECT config_json, work_order_no FROM experiments WHERE id=?",
                    (self.experiment_id,)
                )
                result = cur.fetchone()
            finally:
                db.close()
            if not result:
                logger.warning(f"[脚本执行] 实验{self.experiment_id}未找到配置")
                return
            config_json, work_order_no = result

            # AppConfig.load expects a file, so write the stored JSON to a
            # temporary snapshot and remove it when done (the original code
            # leaked one temp file per experiment end).
            from config_model import AppConfig
            from tempfile import NamedTemporaryFile
            snap_path: Optional[Path] = None
            try:
                with NamedTemporaryFile('w', delete=False, suffix='.json', encoding='utf-8') as tf:
                    tf.write(config_json)
                    snap_path = Path(tf.name)
                config = AppConfig.load(snap_path)

                # Execute the dynamic script.
                from report_generator import _execute_experiment_script
                script_data = _execute_experiment_script(config)
                if script_data:
                    # Save the script output to SQLite.
                    script_data_json = json.dumps(script_data, ensure_ascii=False)
                    db = self._open_db()
                    try:
                        cur = db.cursor()
                        cur.execute(
                            "UPDATE experiments SET script_data=? WHERE id=?",
                            (script_data_json, self.experiment_id)
                        )
                        db.commit()
                    finally:
                        db.close()
                    logger.info(
                        f"[脚本执行] ✅ 实验{self.experiment_id}脚本数据已保存到 SQLite "
                        f"(数据大小: {len(script_data_json)} 字节)"
                    )
                    # Forward to SQL Server when configured.
                    self._write_to_sqlserver(script_data, work_order_no, config)
                else:
                    logger.warning(f"[脚本执行] 实验{self.experiment_id}脚本未返回数据")
            finally:
                if snap_path is not None:
                    snap_path.unlink(missing_ok=True)
        except Exception as e:
            logger.error(
                f"[脚本执行] 实验{self.experiment_id}执行脚本失败: {e}",
                exc_info=True
            )

    def _write_to_sqlserver(self, script_data: dict, work_order_no: str, config) -> None:
        """Write script output to SQL Server, when configured.

        Args:
            script_data: Data returned by the dynamic script.
            work_order_no: Work order number.
            config: Application configuration (AppConfig).
        """
        try:
            # SQL Server settings live in work_order_db_config.json.
            import json
            config_file = Path(__file__).parent / 'work_order_db_config.json'
            if not config_file.exists():
                logger.info(f"[SQL Server] 配置文件不存在,跳过写入: {config_file}")
                return
            with open(config_file, 'r', encoding='utf-8') as f:
                db_config = json.load(f)
            # Debug mode skips the actual write.
            if db_config.get('debug_mode', False):
                logger.info(f"[SQL Server] 调试模式已启用,跳过实际写入")
                return
            sqlserver_config = {
                'host': db_config.get('host', 'localhost'),
                'port': db_config.get('port', 1433),
                'database': db_config.get('database', ''),
                'username': db_config.get('username', ''),
                'password': db_config.get('password', ''),
            }
            # A database name is the minimum required configuration.
            if not sqlserver_config['database']:
                logger.warning(f"[SQL Server] 数据库名未配置,跳过写入")
                return
            logger.info(f"[SQL Server] 开始写入实验{self.experiment_id}的数据")

            from sqlserver_writer import write_script_data_to_sqlserver
            from convert_table_to_sqlserver_format import convert_temperature_table_to_sqlserver

            # Collect global parameters from the configuration, when present.
            global_params = {}
            if hasattr(config, 'globalParameters') and config.globalParameters:
                global_params = config.globalParameters.parameters or {}

            # Convert the tabular data into SQL Server row format.
            write_data = convert_temperature_table_to_sqlserver(script_data, work_order_no, global_params)
            if not write_data:
                logger.warning(f"[SQL Server] 数据转换失败,跳过写入")
                return
            success = write_script_data_to_sqlserver(write_data, sqlserver_config)
            if success:
                logger.info(f"[SQL Server] ✅ 实验{self.experiment_id}数据已写入 SQL Server")
            else:
                logger.warning(f"[SQL Server] ⚠️ 实验{self.experiment_id}数据写入 SQL Server 失败")
        except Exception as e:
            logger.error(
                f"[SQL Server] 实验{self.experiment_id}写入 SQL Server 异常: {e}",
                exc_info=True
            )

    def get_status(self) -> Dict[str, Any]:
        """Return a snapshot of the monitor's current state as a dict."""
        with self._connection_lock:
            return {
                'experiment_id': self.experiment_id,
                'work_order_no': self.work_order_no,
                'is_running': self._thread is not None and self._thread.is_alive(),
                'is_connected': self._is_connected,
                'consecutive_failures': self._consecutive_failures,
                'last_success_time': self._last_success_time.isoformat() if self._last_success_time else None,
                'last_state': self._last_state,
                'experiment_started': self._experiment_started,
                'experiment_ended': self._experiment_ended,
                'start_time_recorded': self._start_time_recorded,
                'end_time_recorded': self._end_time_recorded
            }