PCM_Report/experiment_monitor.py

684 lines
27 KiB
Python
Raw Normal View History

2025-12-11 14:32:31 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
实验状态监控服务 - 在等待状态下定时查询InfluxDB数据检测实验状态变化
"""
import threading
import time
import sqlite3
import datetime
from typing import Optional, Callable, Dict, Any
from logger import get_logger
from influx_service import InfluxService, InfluxConnectionParams
import pandas as pd
logger = get_logger()
class ExperimentStateMonitor:
"""实验状态监控器 - 定时查询InfluxDB数据并检测状态变化"""
def __init__(
self,
experiment_id: int,
work_order_no: str,
start_time: datetime.datetime,
influx_params: InfluxConnectionParams,
query_config: Dict[str, Any],
on_state_changed: Optional[Callable[[str, str], None]] = None,
on_connection_changed: Optional[Callable[[bool, str], None]] = None,
poll_interval: int = 5,
max_retry_attempts: int = 3,
retry_backoff_base: float = 2.0,
max_consecutive_failures: int = 10
):
"""
初始化监控器
Args:
experiment_id: 实验记录ID
work_order_no: 工单号
start_time: 工单填写时间查询此时间之后的数据
influx_params: InfluxDB连接参数
query_config: 查询配置包含:
- bucket: 数据桶名称
- measurement: 测量名称
- fields: 字段列表
- filters: 过滤条件
- status_field: 状态字段名称用于检测状态变化
- status_values: 状态值配置 {'start': '开始值', 'end': '结束值'}
on_state_changed: 状态变化回调函数 (old_state, new_state)
on_connection_changed: 连接状态变化回调函数 (is_connected, message)
poll_interval: 轮询间隔
max_retry_attempts: 单次查询最大重试次数
retry_backoff_base: 重试退避基数指数退避
max_consecutive_failures: 最大连续失败次数超过后报告连接断开
"""
self.experiment_id = experiment_id
self.work_order_no = work_order_no
self.start_time = start_time
self.influx_params = influx_params
self.query_config = query_config
self.on_state_changed = on_state_changed
self.on_connection_changed = on_connection_changed
self.poll_interval = poll_interval
self.max_retry_attempts = max_retry_attempts
self.retry_backoff_base = retry_backoff_base
self.max_consecutive_failures = max_consecutive_failures
self._thread: Optional[threading.Thread] = None
self._stop_event = threading.Event()
self._last_state: Optional[str] = None
self._experiment_started = False
self._experiment_ended = False
self._start_time_recorded: Optional[str] = None
self._end_time_recorded: Optional[str] = None
# 连接状态管理
self._is_connected = False
self._consecutive_failures = 0
self._last_success_time: Optional[datetime.datetime] = None
self._connection_lock = threading.Lock()
logger.info(
f"[监控器初始化] 实验ID={experiment_id}, 工单号={work_order_no}, "
f"开始时间={start_time}, 轮询间隔={poll_interval}秒, "
f"最大重试次数={max_retry_attempts}, 最大连续失败次数={max_consecutive_failures}"
)
def start(self) -> None:
"""启动监控线程"""
if self._thread is not None and self._thread.is_alive():
logger.warning(f"[监控器] 实验{self.experiment_id}的监控线程已在运行")
return
self._stop_event.clear()
self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
self._thread.start()
logger.info(f"[监控器] 实验{self.experiment_id}的监控线程已启动")
def stop(self) -> None:
"""停止监控线程"""
if self._thread is None:
return
self._stop_event.set()
# 避免线程试图join自己的问题
current_thread = threading.current_thread()
if self._thread.is_alive() and self._thread != current_thread:
self._thread.join(timeout=5)
logger.info(f"[监控器] 实验{self.experiment_id}的监控线程已停止")
def _monitor_loop(self) -> None:
"""监控循环 - 定时查询并检测状态变化"""
try:
logger.info(f"[监控循环] 开始监控实验{self.experiment_id}")
while not self._stop_event.is_set():
try:
# 查询当前状态
logger.debug(f"[监控循环] 实验{self.experiment_id}开始查询状态...")
current_state = self._query_current_state()
logger.info(
f"[监控循环] 实验{self.experiment_id}查询结果: 当前状态={current_state}, "
f"上一状态={self._last_state}"
)
if current_state is not None:
# 检测状态变化
if self._last_state is None:
# 第一次查询,记录初始状态
self._last_state = current_state
logger.info(
f"[监控循环] 实验{self.experiment_id}初始状态: {current_state}"
)
elif current_state != self._last_state:
# 状态发生变化
old_state = self._last_state
self._last_state = current_state
logger.info(
f"[监控循环] 🔄 实验{self.experiment_id}状态变化: {old_state} -> {current_state}"
)
# 处理状态变化
self._handle_state_change(old_state, current_state)
else:
logger.debug(
f"[监控循环] 实验{self.experiment_id}状态无变化: {current_state}"
)
else:
logger.debug(f"[监控循环] 实验{self.experiment_id}暂无数据")
# 等待下一个轮询周期
self._stop_event.wait(self.poll_interval)
except Exception as e:
logger.error(f"[监控循环] 实验{self.experiment_id}查询错误: {e}", exc_info=True)
self._stop_event.wait(self.poll_interval)
except Exception as e:
logger.error(f"[监控循环] 实验{self.experiment_id}监控线程异常: {e}", exc_info=True)
finally:
logger.info(f"[监控循环] 实验{self.experiment_id}监控线程已退出")
def _query_current_state(self) -> Optional[str]:
"""
查询当前状态带重试机制
Returns:
当前状态值如果查询失败返回None
"""
# 查询配置验证
bucket = self.query_config.get('bucket', '')
measurement = self.query_config.get('measurement', '')
fields = self.query_config.get('fields', [])
filters = self.query_config.get('filters', {})
status_field = self.query_config.get('status_field', '')
if not all([bucket, measurement, fields, status_field]):
logger.warning(
f"[查询] 实验{self.experiment_id}查询配置不完整: "
f"bucket={bucket}, measurement={measurement}, fields={fields}, status_field={status_field}"
)
return None
# 带重试的查询
for attempt in range(self.max_retry_attempts):
try:
# 执行查询
result = self._execute_query(bucket, measurement, fields, filters, status_field)
# 查询成功,更新连接状态
self._on_query_success()
return result
except Exception as e:
# 查询失败,记录错误
is_last_attempt = (attempt == self.max_retry_attempts - 1)
if is_last_attempt:
logger.error(
f"[查询] 实验{self.experiment_id}查询失败(已重试{attempt + 1}次): {e}",
exc_info=True
)
# 更新失败状态
self._on_query_failure(str(e))
else:
# 计算退避时间
backoff_time = self.retry_backoff_base ** attempt
logger.warning(
f"[查询] 实验{self.experiment_id}查询失败(第{attempt + 1}次尝试),"
f"{backoff_time:.1f}秒后重试: {e}"
)
# 等待后重试
time.sleep(backoff_time)
return None
def _execute_query(self, bucket: str, measurement: str, fields: list,
filters: dict, status_field: str) -> Optional[str]:
"""
执行单次查询不含重试逻辑
Returns:
当前状态值如果查询结果为空返回None
Raises:
Exception: 查询过程中的任何异常
"""
# 创建服务实例
service = InfluxService(self.influx_params)
# 构建时间范围查询最近1小时的数据
time_range = "-1h"
logger.debug(
f"[查询] 实验{self.experiment_id}查询InfluxDB: "
f"bucket={bucket}, measurement={measurement}, fields={fields}"
)
# 执行查询
df = service.query(
bucket=bucket,
measurement=measurement,
fields=fields,
filters=filters,
time_range=time_range
)
if df.empty:
logger.debug(f"[查询] 实验{self.experiment_id}查询结果为空")
return None
# 获取最新的状态值
if '_value' in df.columns and '_field' in df.columns:
# 筛选出指定字段的数据
status_data = df[df['_field'] == status_field]
if not status_data.empty:
# 按时间排序,取最后一条记录
if '_time' in status_data.columns:
status_data = status_data.sort_values('_time')
latest_value = status_data['_value'].iloc[-1]
logger.debug(
f"[查询] 实验{self.experiment_id}最新状态值: {latest_value} "
f"(字段: {status_field})"
)
return str(latest_value)
else:
logger.warning(
f"[查询] 实验{self.experiment_id}未找到字段'{status_field}'的数据"
)
return None
else:
logger.warning(
f"[查询] 实验{self.experiment_id}查询结果格式异常, "
f"可用字段: {list(df.columns)}"
)
return None
def _on_query_success(self) -> None:
"""
查询成功时的处理
"""
with self._connection_lock:
was_disconnected = not self._is_connected
# 重置失败计数
self._consecutive_failures = 0
self._last_success_time = datetime.datetime.now()
# 更新连接状态
if was_disconnected:
self._is_connected = True
elapsed = None
if self._last_success_time:
elapsed = (datetime.datetime.now() - self._last_success_time).total_seconds()
message = f"实验{self.experiment_id}已恢复与InfluxDB的连接"
logger.info(f"[连接恢复] ✅ {message}")
# 触发连接状态变化回调
if self.on_connection_changed:
try:
self.on_connection_changed(True, message)
except Exception as e:
logger.error(f"[连接回调] 执行连接状态回调失败: {e}", exc_info=True)
def _on_query_failure(self, error_message: str) -> None:
"""
查询失败时的处理
Args:
error_message: 错误信息
"""
with self._connection_lock:
self._consecutive_failures += 1
# 判断是否应该标记为断开连接
if self._consecutive_failures >= self.max_consecutive_failures:
was_connected = self._is_connected
if was_connected:
self._is_connected = False
message = (
f"实验{self.experiment_id}与InfluxDB连接断开 "
f"(连续失败{self._consecutive_failures}次): {error_message}"
)
logger.error(f"[连接断开] ❌ {message}")
# 触发连接状态变化回调
if self.on_connection_changed:
try:
self.on_connection_changed(False, message)
except Exception as e:
logger.error(f"[连接回调] 执行连接状态回调失败: {e}", exc_info=True)
else:
logger.warning(
f"[连接状态] 实验{self.experiment_id}仍处于断开状态 "
f"(连续失败{self._consecutive_failures}次)"
)
else:
logger.warning(
f"[连接状态] 实验{self.experiment_id}查询失败 "
f"(连续失败{self._consecutive_failures}/{self.max_consecutive_failures}次)"
)
def _handle_state_change(self, old_state: str, new_state: str) -> None:
"""
处理状态变化
Args:
old_state: 旧状态
new_state: 新状态
"""
try:
status_values = self.query_config.get('status_values', {})
start_value = status_values.get('start', '')
end_value = status_values.get('end', '')
logger.info(
f"[状态变化] 实验{self.experiment_id}: {old_state} -> {new_state}, "
f"开始值={start_value}, 结束值={end_value}"
)
# 检测实验开始:状态从非开始值变为开始值
if not self._experiment_started and str(new_state) == str(start_value):
self._on_experiment_started()
# 检测实验结束:状态从开始值变回非开始值
elif self._experiment_started and str(new_state) == str(end_value):
self._on_experiment_ended()
# 触发回调
if self.on_state_changed:
self.on_state_changed(old_state, new_state)
except Exception as e:
logger.error(f"[状态变化] 实验{self.experiment_id}处理失败: {e}", exc_info=True)
def _on_experiment_started(self) -> None:
"""实验开始事件处理"""
try:
self._experiment_started = True
self._start_time_recorded = datetime.datetime.now().isoformat(timespec='seconds')
logger.info(
f"[实验开始] 🟢 实验{self.experiment_id}已开始, "
f"记录时间: {self._start_time_recorded}"
)
# 更新数据库设置start_ts
self._update_experiment_start_time(self._start_time_recorded)
logger.info(f"[实验开始] ✅ 实验{self.experiment_id}开始时间已记录并更新数据库")
except Exception as e:
logger.error(f"[实验开始] 实验{self.experiment_id}处理失败: {e}", exc_info=True)
def _on_experiment_ended(self) -> None:
"""实验结束事件处理"""
try:
# 检查实验是否处于暂停状态
if self._is_experiment_paused():
logger.info(
f"[实验结束] ⏸️ 实验{self.experiment_id}处于暂停状态,"
f"忽略状态变化,不记录结束时间"
)
return
self._experiment_ended = True
self._end_time_recorded = datetime.datetime.now().isoformat(timespec='seconds')
logger.info(
f"[实验结束] 🔴 实验{self.experiment_id}已结束, "
f"记录时间: {self._end_time_recorded}"
)
# 更新数据库设置end_ts
self._update_experiment_end_time(self._end_time_recorded)
logger.info(f"[实验结束] ✅ 实验{self.experiment_id}结束时间已记录并更新数据库")
# 设置停止标志,让监控循环自然退出
# 不在这里直接调用stop()避免线程join自己的问题
self._stop_event.set()
logger.info(f"[实验结束] 🛑 已设置停止标志,监控将自动退出")
except Exception as e:
logger.error(f"[实验结束] 实验{self.experiment_id}处理失败: {e}", exc_info=True)
def _is_experiment_paused(self) -> bool:
"""检查实验是否处于暂停状态"""
try:
from pathlib import Path
db_path = Path(__file__).parent / "experiments.db"
db = sqlite3.connect(str(db_path))
cur = db.cursor()
cur.execute(
"SELECT is_paused FROM experiments WHERE id=?",
(self.experiment_id,)
)
result = cur.fetchone()
db.close()
if result:
is_paused = result[0] == 1
logger.debug(f"[暂停检查] 实验{self.experiment_id}暂停状态: {is_paused}")
return is_paused
else:
logger.warning(f"[暂停检查] 实验{self.experiment_id}未找到记录")
return False
except Exception as e:
logger.error(
f"[暂停检查] 实验{self.experiment_id}查询失败: {e}",
exc_info=True
)
return False
def _update_experiment_start_time(self, start_time: str) -> None:
"""更新实验记录的开始时间"""
try:
from pathlib import Path
db_path = Path(__file__).parent / "experiments.db"
db = sqlite3.connect(str(db_path))
cur = db.cursor()
cur.execute(
"UPDATE experiments SET start_ts=? WHERE id=?",
(start_time, self.experiment_id)
)
db.commit()
db.close()
logger.info(
f"[数据库更新] 实验{self.experiment_id}开始时间已更新: {start_time}"
)
except Exception as e:
logger.error(
f"[数据库更新] 实验{self.experiment_id}更新开始时间失败: {e}",
exc_info=True
)
def _update_experiment_end_time(self, end_time: str) -> None:
"""更新实验记录的结束时间,并执行动态脚本"""
try:
from pathlib import Path
db_path = Path(__file__).parent / "experiments.db"
db = sqlite3.connect(str(db_path))
cur = db.cursor()
cur.execute(
"UPDATE experiments SET end_ts=? WHERE id=?",
(end_time, self.experiment_id)
)
db.commit()
db.close()
logger.info(
f"[数据库更新] 实验{self.experiment_id}结束时间已更新: {end_time}"
)
# 实验结束后,自动执行动态脚本并保存数据
self._execute_and_save_script_data()
except Exception as e:
logger.error(
f"[数据库更新] 实验{self.experiment_id}更新结束时间失败: {e}",
exc_info=True
)
def _execute_and_save_script_data(self) -> None:
"""执行动态脚本并保存返回数据到数据库"""
try:
from pathlib import Path
import json
logger.info(f"[脚本执行] 开始为实验{self.experiment_id}执行动态脚本")
# 获取实验配置
db_path = Path(__file__).parent / "experiments.db"
db = sqlite3.connect(str(db_path))
cur = db.cursor()
cur.execute(
"SELECT config_json, work_order_no FROM experiments WHERE id=?",
(self.experiment_id,)
)
result = cur.fetchone()
if not result:
logger.warning(f"[脚本执行] 实验{self.experiment_id}未找到配置")
db.close()
return
config_json, work_order_no = result
db.close()
# 解析配置
from config_model import AppConfig
from tempfile import NamedTemporaryFile
with NamedTemporaryFile('w', delete=False, suffix='.json', encoding='utf-8') as tf:
tf.write(config_json)
snap_path = Path(tf.name)
config = AppConfig.load(snap_path)
# 执行脚本
from report_generator import _execute_experiment_script
script_data = _execute_experiment_script(config)
if script_data:
# 保存脚本数据到 SQLite 数据库
script_data_json = json.dumps(script_data, ensure_ascii=False)
db = sqlite3.connect(str(db_path))
cur = db.cursor()
cur.execute(
"UPDATE experiments SET script_data=? WHERE id=?",
(script_data_json, self.experiment_id)
)
db.commit()
db.close()
logger.info(
f"[脚本执行] ✅ 实验{self.experiment_id}脚本数据已保存到 SQLite "
f"(数据大小: {len(script_data_json)} 字节)"
)
# 写入 SQL Server如果配置了
self._write_to_sqlserver(script_data, work_order_no, config)
else:
logger.warning(f"[脚本执行] 实验{self.experiment_id}脚本未返回数据")
except Exception as e:
logger.error(
f"[脚本执行] 实验{self.experiment_id}执行脚本失败: {e}",
exc_info=True
)
def _write_to_sqlserver(self, script_data: dict, work_order_no: str, config) -> None:
"""
将脚本数据写入 SQL Server
Args:
script_data: 脚本返回的数据
work_order_no: 工单号
config: 应用配置
"""
try:
# 从 work_order_db_config.json 加载 SQL Server 配置
from pathlib import Path
import json
config_file = Path(__file__).parent / 'work_order_db_config.json'
if not config_file.exists():
logger.info(f"[SQL Server] 配置文件不存在,跳过写入: {config_file}")
return
# 读取配置
with open(config_file, 'r', encoding='utf-8') as f:
db_config = json.load(f)
# 检查是否为调试模式
if db_config.get('debug_mode', False):
logger.info(f"[SQL Server] 调试模式已启用,跳过实际写入")
return
sqlserver_config = {
'host': db_config.get('host', 'localhost'),
'port': db_config.get('port', 1433),
'database': db_config.get('database', ''),
'username': db_config.get('username', ''),
'password': db_config.get('password', ''),
}
# 检查必要配置
if not sqlserver_config['database']:
logger.warning(f"[SQL Server] 数据库名未配置,跳过写入")
return
logger.info(f"[SQL Server] 开始写入实验{self.experiment_id}的数据")
# 准备写入数据
from sqlserver_writer import write_script_data_to_sqlserver
from convert_table_to_sqlserver_format import convert_temperature_table_to_sqlserver
# 获取全局参数
global_params = {}
if hasattr(config, 'globalParameters') and config.globalParameters:
global_params = config.globalParameters.parameters or {}
# 转换表格数据为 SQL Server 格式
write_data = convert_temperature_table_to_sqlserver(script_data, work_order_no, global_params)
if not write_data:
logger.warning(f"[SQL Server] 数据转换失败,跳过写入")
return
# 写入数据
success = write_script_data_to_sqlserver(write_data, sqlserver_config)
if success:
logger.info(f"[SQL Server] ✅ 实验{self.experiment_id}数据已写入 SQL Server")
else:
logger.warning(f"[SQL Server] ⚠️ 实验{self.experiment_id}数据写入 SQL Server 失败")
except Exception as e:
logger.error(
f"[SQL Server] 实验{self.experiment_id}写入 SQL Server 异常: {e}",
exc_info=True
)
def get_status(self) -> Dict[str, Any]:
"""获取监控器状态"""
with self._connection_lock:
return {
'experiment_id': self.experiment_id,
'work_order_no': self.work_order_no,
'is_running': self._thread is not None and self._thread.is_alive(),
'is_connected': self._is_connected,
'consecutive_failures': self._consecutive_failures,
'last_success_time': self._last_success_time.isoformat() if self._last_success_time else None,
'last_state': self._last_state,
'experiment_started': self._experiment_started,
'experiment_ended': self._experiment_ended,
'start_time_recorded': self._start_time_recorded,
'end_time_recorded': self._end_time_recorded
}