PCM_Report/experiment_monitor.py

684 lines
27 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
实验状态监控服务 - 在等待状态下定时查询InfluxDB数据检测实验状态变化
"""
import threading
import time
import sqlite3
import datetime
from typing import Optional, Callable, Dict, Any
from logger import get_logger
from influx_service import InfluxService, InfluxConnectionParams
import pandas as pd
logger = get_logger()
class ExperimentStateMonitor:
"""实验状态监控器 - 定时查询InfluxDB数据并检测状态变化"""
def __init__(
    self,
    experiment_id: int,
    work_order_no: str,
    start_time: datetime.datetime,
    influx_params: InfluxConnectionParams,
    query_config: Dict[str, Any],
    on_state_changed: Optional[Callable[[str, str], None]] = None,
    on_connection_changed: Optional[Callable[[bool, str], None]] = None,
    poll_interval: int = 5,
    max_retry_attempts: int = 3,
    retry_backoff_base: float = 2.0,
    max_consecutive_failures: int = 10
):
    """Set up a monitor for one experiment.

    Args:
        experiment_id: Experiment record id (SQLite `experiments.id`).
        work_order_no: Work-order number.
        start_time: Time the work order was filled in (data after this is queried).
        influx_params: InfluxDB connection parameters.
        query_config: Query configuration containing:
            - bucket: bucket name
            - measurement: measurement name
            - fields: list of field names
            - filters: filter conditions
            - status_field: field whose value signals state changes
            - status_values: {'start': <start value>, 'end': <end value>}
        on_state_changed: Callback (old_state, new_state) fired on every change.
        on_connection_changed: Callback (is_connected, message) fired when
            connectivity to InfluxDB is gained or lost.
        poll_interval: Polling interval in seconds.
        max_retry_attempts: Maximum retries for a single query.
        retry_backoff_base: Base of the exponential retry backoff.
        max_consecutive_failures: Failures in a row before the connection is
            reported as down.
    """
    # Caller-supplied configuration.
    self.experiment_id = experiment_id
    self.work_order_no = work_order_no
    self.start_time = start_time
    self.influx_params = influx_params
    self.query_config = query_config
    self.on_state_changed = on_state_changed
    self.on_connection_changed = on_connection_changed
    self.poll_interval = poll_interval
    self.max_retry_attempts = max_retry_attempts
    self.retry_backoff_base = retry_backoff_base
    self.max_consecutive_failures = max_consecutive_failures
    # Worker-thread control.
    self._thread: Optional[threading.Thread] = None
    self._stop_event = threading.Event()
    # State-transition tracking (only touched by the monitor thread).
    self._last_state: Optional[str] = None
    self._experiment_started = False
    self._experiment_ended = False
    self._start_time_recorded: Optional[str] = None
    self._end_time_recorded: Optional[str] = None
    # Connection-health tracking, guarded by _connection_lock.
    self._is_connected = False
    self._consecutive_failures = 0
    self._last_success_time: Optional[datetime.datetime] = None
    self._connection_lock = threading.Lock()
    logger.info(
        f"[监控器初始化] 实验ID={experiment_id}, 工单号={work_order_no}, "
        f"开始时间={start_time}, 轮询间隔={poll_interval}秒, "
        f"最大重试次数={max_retry_attempts}, 最大连续失败次数={max_consecutive_failures}"
    )
def start(self) -> None:
    """Launch the background monitoring thread; no-op if one is already alive."""
    already_running = self._thread is not None and self._thread.is_alive()
    if already_running:
        logger.warning(f"[监控器] 实验{self.experiment_id}的监控线程已在运行")
        return
    self._stop_event.clear()
    worker = threading.Thread(target=self._monitor_loop, daemon=True)
    self._thread = worker
    worker.start()
    logger.info(f"[监控器] 实验{self.experiment_id}的监控线程已启动")
def stop(self) -> None:
    """Signal the monitor thread to stop and wait briefly for it to exit."""
    worker = self._thread
    if worker is None:
        return
    self._stop_event.set()
    # Never join the current thread: the monitor thread itself can reach
    # this path at end of experiment, and joining yourself deadlocks.
    if worker.is_alive() and worker != threading.current_thread():
        worker.join(timeout=5)
    logger.info(f"[监控器] 实验{self.experiment_id}的监控线程已停止")
def _monitor_loop(self) -> None:
    """Monitoring loop: poll InfluxDB on a fixed interval and detect state changes.

    Runs on the daemon thread created by start(); exits when _stop_event is
    set (by stop(), or by _on_experiment_ended after the experiment finishes).
    """
    try:
        logger.info(f"[监控循环] 开始监控实验{self.experiment_id}")
        while not self._stop_event.is_set():
            try:
                # Query the current status value (None = no data or query failed).
                logger.debug(f"[监控循环] 实验{self.experiment_id}开始查询状态...")
                current_state = self._query_current_state()
                logger.info(
                    f"[监控循环] 实验{self.experiment_id}查询结果: 当前状态={current_state}, "
                    f"上一状态={self._last_state}"
                )
                if current_state is not None:
                    # Compare against the previously observed state.
                    if self._last_state is None:
                        # First observation: record the baseline without firing callbacks.
                        self._last_state = current_state
                        logger.info(
                            f"[监控循环] 实验{self.experiment_id}初始状态: {current_state}"
                        )
                    elif current_state != self._last_state:
                        # State transition detected; update bookkeeping before dispatching.
                        old_state = self._last_state
                        self._last_state = current_state
                        logger.info(
                            f"[监控循环] 🔄 实验{self.experiment_id}状态变化: {old_state} -> {current_state}"
                        )
                        # Start/end detection plus the user callback.
                        self._handle_state_change(old_state, current_state)
                    else:
                        logger.debug(
                            f"[监控循环] 实验{self.experiment_id}状态无变化: {current_state}"
                        )
                else:
                    logger.debug(f"[监控循环] 实验{self.experiment_id}暂无数据")
                # Sleep until the next poll; wakes early if stop is requested.
                self._stop_event.wait(self.poll_interval)
            except Exception as e:
                # Per-iteration failures are logged and polling continues.
                logger.error(f"[监控循环] 实验{self.experiment_id}查询错误: {e}", exc_info=True)
                self._stop_event.wait(self.poll_interval)
    except Exception as e:
        logger.error(f"[监控循环] 实验{self.experiment_id}监控线程异常: {e}", exc_info=True)
    finally:
        logger.info(f"[监控循环] 实验{self.experiment_id}监控线程已退出")
def _query_current_state(self) -> Optional[str]:
    """Query the current status value from InfluxDB, with retry and backoff.

    Returns:
        The latest status value as a string, or None when the configuration
        is incomplete, the query yields no data, the monitor is stopping,
        or every retry attempt fails.
    """
    # Validate the query configuration before touching the network.
    bucket = self.query_config.get('bucket', '')
    measurement = self.query_config.get('measurement', '')
    fields = self.query_config.get('fields', [])
    filters = self.query_config.get('filters', {})
    status_field = self.query_config.get('status_field', '')
    if not all([bucket, measurement, fields, status_field]):
        logger.warning(
            f"[查询] 实验{self.experiment_id}查询配置不完整: "
            f"bucket={bucket}, measurement={measurement}, fields={fields}, status_field={status_field}"
        )
        return None
    # Query with exponential-backoff retries.
    for attempt in range(self.max_retry_attempts):
        try:
            result = self._execute_query(bucket, measurement, fields, filters, status_field)
            # A successful round-trip (even with no rows) proves connectivity.
            self._on_query_success()
            return result
        except Exception as e:
            is_last_attempt = (attempt == self.max_retry_attempts - 1)
            if is_last_attempt:
                logger.error(
                    f"[查询] 实验{self.experiment_id}查询失败(已重试{attempt + 1}次): {e}",
                    exc_info=True
                )
                # Count this round as a failure towards disconnect detection.
                self._on_query_failure(str(e))
            else:
                backoff_time = self.retry_backoff_base ** attempt
                logger.warning(
                    f"[查询] 实验{self.experiment_id}查询失败(第{attempt + 1}次尝试),"
                    f"{backoff_time:.1f}秒后重试: {e}"
                )
                # BUGFIX: wait on the stop event instead of time.sleep so
                # stop() is not blocked for the full backoff; if stop is
                # requested during the wait, abandon the remaining retries.
                if self._stop_event.wait(backoff_time):
                    return None
    return None
def _execute_query(self, bucket: str, measurement: str, fields: list,
                   filters: dict, status_field: str) -> Optional[str]:
    """Run one InfluxDB query and extract the newest value of *status_field*.

    Returns:
        The latest status value as a string, or None when the result set is
        empty, has an unexpected layout, or lacks the status field.

    Raises:
        Exception: any error raised while talking to InfluxDB.
    """
    service = InfluxService(self.influx_params)
    logger.debug(
        f"[查询] 实验{self.experiment_id}查询InfluxDB: "
        f"bucket={bucket}, measurement={measurement}, fields={fields}"
    )
    # Only the most recent hour of data is relevant for state detection.
    df = service.query(
        bucket=bucket,
        measurement=measurement,
        fields=fields,
        filters=filters,
        time_range="-1h"
    )
    if df.empty:
        logger.debug(f"[查询] 实验{self.experiment_id}查询结果为空")
        return None
    # Guard against result frames that are missing the expected columns.
    if '_value' not in df.columns or '_field' not in df.columns:
        logger.warning(
            f"[查询] 实验{self.experiment_id}查询结果格式异常, "
            f"可用字段: {list(df.columns)}"
        )
        return None
    # Narrow the frame down to rows for the configured status field.
    status_rows = df[df['_field'] == status_field]
    if status_rows.empty:
        logger.warning(
            f"[查询] 实验{self.experiment_id}未找到字段'{status_field}'的数据"
        )
        return None
    # Order by timestamp (when available) so the last row is the newest.
    if '_time' in status_rows.columns:
        status_rows = status_rows.sort_values('_time')
    latest_value = status_rows['_value'].iloc[-1]
    logger.debug(
        f"[查询] 实验{self.experiment_id}最新状态值: {latest_value} "
        f"(字段: {status_field})"
    )
    return str(latest_value)
def _on_query_success(self) -> None:
    """Record a successful query.

    Resets the consecutive-failure counter and the last-success timestamp;
    if the monitor was previously marked disconnected, flips back to
    connected and fires the on_connection_changed callback once.
    """
    with self._connection_lock:
        was_disconnected = not self._is_connected
        # Any successful round-trip clears the failure streak.
        self._consecutive_failures = 0
        self._last_success_time = datetime.datetime.now()
        if was_disconnected:
            self._is_connected = True
            # BUGFIX: removed the dead "elapsed" computation here — it
            # measured the time since _last_success_time *after* that
            # timestamp was reset above (always ~0s) and the value was
            # never used in the log message or the callback.
            message = f"实验{self.experiment_id}已恢复与InfluxDB的连接"
            logger.info(f"[连接恢复] ✅ {message}")
            # Notify the owner; callback errors must not break the monitor.
            if self.on_connection_changed:
                try:
                    self.on_connection_changed(True, message)
                except Exception as e:
                    logger.error(f"[连接回调] 执行连接状态回调失败: {e}", exc_info=True)
def _on_query_failure(self, error_message: str) -> None:
    """Record a failed query; after enough failures in a row, report the
    connection as down (once) via the on_connection_changed callback.

    Args:
        error_message: Description of the failure, forwarded in the callback.
    """
    with self._connection_lock:
        self._consecutive_failures += 1
        # Below the threshold: just warn and keep the connection state.
        if self._consecutive_failures < self.max_consecutive_failures:
            logger.warning(
                f"[连接状态] 实验{self.experiment_id}查询失败 "
                f"(连续失败{self._consecutive_failures}/{self.max_consecutive_failures}次)"
            )
            return
        # Threshold reached while already flagged down: no repeated callback.
        if not self._is_connected:
            logger.warning(
                f"[连接状态] 实验{self.experiment_id}仍处于断开状态 "
                f"(连续失败{self._consecutive_failures}次)"
            )
            return
        # First time crossing the threshold: flip to disconnected and notify.
        self._is_connected = False
        message = (
            f"实验{self.experiment_id}与InfluxDB连接断开 "
            f"(连续失败{self._consecutive_failures}次): {error_message}"
        )
        logger.error(f"[连接断开] ❌ {message}")
        if self.on_connection_changed:
            try:
                self.on_connection_changed(False, message)
            except Exception as e:
                logger.error(f"[连接回调] 执行连接状态回调失败: {e}", exc_info=True)
def _handle_state_change(self, old_state: str, new_state: str) -> None:
    """React to a status transition: detect experiment start/end against the
    configured status_values, then forward the change to on_state_changed.

    Args:
        old_state: Previous status value.
        new_state: Newly observed status value.
    """
    try:
        status_values = self.query_config.get('status_values', {})
        start_value = status_values.get('start', '')
        end_value = status_values.get('end', '')
        logger.info(
            f"[状态变化] 实验{self.experiment_id}: {old_state} -> {new_state}, "
            f"开始值={start_value}, 结束值={end_value}"
        )
        new_text = str(new_state)
        if new_text == str(start_value) and not self._experiment_started:
            # Entered the configured "start" value for the first time.
            self._on_experiment_started()
        elif self._experiment_started and new_text == str(end_value):
            # Entered the configured "end" value after a recorded start.
            self._on_experiment_ended()
        # User callback runs after internal bookkeeping.
        if self.on_state_changed:
            self.on_state_changed(old_state, new_state)
    except Exception as e:
        logger.error(f"[状态变化] 实验{self.experiment_id}处理失败: {e}", exc_info=True)
def _on_experiment_started(self) -> None:
    """Mark the experiment as started and persist the start timestamp."""
    try:
        started_at = datetime.datetime.now().isoformat(timespec='seconds')
        self._experiment_started = True
        self._start_time_recorded = started_at
        logger.info(
            f"[实验开始] 🟢 实验{self.experiment_id}已开始, "
            f"记录时间: {self._start_time_recorded}"
        )
        # Persist start_ts so the record survives a process restart.
        self._update_experiment_start_time(started_at)
        logger.info(f"[实验开始] ✅ 实验{self.experiment_id}开始时间已记录并更新数据库")
    except Exception as e:
        logger.error(f"[实验开始] 实验{self.experiment_id}处理失败: {e}", exc_info=True)
def _on_experiment_ended(self) -> None:
    """Handle the end-of-experiment transition: record and persist the end
    time, then ask the monitor loop to shut itself down."""
    try:
        # A paused experiment may legitimately report the "end" value;
        # ignore it so the end time is not recorded prematurely.
        if self._is_experiment_paused():
            logger.info(
                f"[实验结束] ⏸️ 实验{self.experiment_id}处于暂停状态,"
                f"忽略状态变化,不记录结束时间"
            )
            return
        ended_at = datetime.datetime.now().isoformat(timespec='seconds')
        self._experiment_ended = True
        self._end_time_recorded = ended_at
        logger.info(
            f"[实验结束] 🔴 实验{self.experiment_id}已结束, "
            f"记录时间: {self._end_time_recorded}"
        )
        # Persist end_ts (this also triggers the post-run script pipeline).
        self._update_experiment_end_time(ended_at)
        logger.info(f"[实验结束] ✅ 实验{self.experiment_id}结束时间已记录并更新数据库")
        # Only set the stop flag here — calling stop() from the monitor
        # thread would make it try to join itself; the loop exits on its own.
        self._stop_event.set()
        logger.info(f"[实验结束] 🛑 已设置停止标志,监控将自动退出")
    except Exception as e:
        logger.error(f"[实验结束] 实验{self.experiment_id}处理失败: {e}", exc_info=True)
def _is_experiment_paused(self) -> bool:
    """Check the SQLite record to see whether the experiment is paused.

    Returns:
        True if the experiments row exists and is_paused == 1; False when it
        is not paused, the row is missing, or the lookup fails.
    """
    try:
        from pathlib import Path
        db_path = Path(__file__).parent / "experiments.db"
        db = sqlite3.connect(str(db_path))
        try:
            cur = db.cursor()
            cur.execute(
                "SELECT is_paused FROM experiments WHERE id=?",
                (self.experiment_id,)
            )
            result = cur.fetchone()
        finally:
            # BUGFIX: close the connection even if the query raises;
            # the original leaked the handle on error.
            db.close()
        if result:
            is_paused = result[0] == 1
            logger.debug(f"[暂停检查] 实验{self.experiment_id}暂停状态: {is_paused}")
            return is_paused
        logger.warning(f"[暂停检查] 实验{self.experiment_id}未找到记录")
        return False
    except Exception as e:
        logger.error(
            f"[暂停检查] 实验{self.experiment_id}查询失败: {e}",
            exc_info=True
        )
        return False
def _update_experiment_start_time(self, start_time: str) -> None:
    """Write the experiment's start timestamp (start_ts) into SQLite.

    Args:
        start_time: ISO-8601 timestamp string to store. Failures are logged,
            never raised.
    """
    try:
        from pathlib import Path
        db_path = Path(__file__).parent / "experiments.db"
        db = sqlite3.connect(str(db_path))
        try:
            cur = db.cursor()
            cur.execute(
                "UPDATE experiments SET start_ts=? WHERE id=?",
                (start_time, self.experiment_id)
            )
            db.commit()
        finally:
            # BUGFIX: close the connection even when execute/commit raises;
            # the original leaked the handle on error.
            db.close()
        logger.info(
            f"[数据库更新] 实验{self.experiment_id}开始时间已更新: {start_time}"
        )
    except Exception as e:
        logger.error(
            f"[数据库更新] 实验{self.experiment_id}更新开始时间失败: {e}",
            exc_info=True
        )
def _update_experiment_end_time(self, end_time: str) -> None:
    """Write the experiment's end timestamp (end_ts) into SQLite, then run
    the post-run dynamic-script pipeline.

    Args:
        end_time: ISO-8601 timestamp string to store. Failures are logged,
            never raised.
    """
    try:
        from pathlib import Path
        db_path = Path(__file__).parent / "experiments.db"
        db = sqlite3.connect(str(db_path))
        try:
            cur = db.cursor()
            cur.execute(
                "UPDATE experiments SET end_ts=? WHERE id=?",
                (end_time, self.experiment_id)
            )
            db.commit()
        finally:
            # BUGFIX: close the connection even when execute/commit raises;
            # the original leaked the handle on error.
            db.close()
        logger.info(
            f"[数据库更新] 实验{self.experiment_id}结束时间已更新: {end_time}"
        )
        # Once the end time is stored, run the script and persist its output.
        self._execute_and_save_script_data()
    except Exception as e:
        logger.error(
            f"[数据库更新] 实验{self.experiment_id}更新结束时间失败: {e}",
            exc_info=True
        )
def _execute_and_save_script_data(self) -> None:
    """Run the experiment's dynamic script and persist the returned data.

    Loads the experiment's config snapshot from SQLite, executes the
    configured script, stores the JSON-encoded result back into SQLite,
    and forwards it to SQL Server when that is configured. All failures
    are logged, never raised.
    """
    try:
        from pathlib import Path
        import json
        logger.info(f"[脚本执行] 开始为实验{self.experiment_id}执行动态脚本")
        # Fetch the stored config snapshot and work-order number.
        db_path = Path(__file__).parent / "experiments.db"
        db = sqlite3.connect(str(db_path))
        try:
            cur = db.cursor()
            cur.execute(
                "SELECT config_json, work_order_no FROM experiments WHERE id=?",
                (self.experiment_id,)
            )
            result = cur.fetchone()
        finally:
            # BUGFIX: always close; the original leaked the handle on error.
            db.close()
        if not result:
            logger.warning(f"[脚本执行] 实验{self.experiment_id}未找到配置")
            return
        config_json, work_order_no = result
        # AppConfig.load expects a file path, so round-trip the snapshot
        # through a temporary file.
        from config_model import AppConfig
        from tempfile import NamedTemporaryFile
        with NamedTemporaryFile('w', delete=False, suffix='.json', encoding='utf-8') as tf:
            tf.write(config_json)
            snap_path = Path(tf.name)
        try:
            config = AppConfig.load(snap_path)
        finally:
            # BUGFIX: the original never removed the delete=False temp file,
            # leaking one file per finished experiment.
            snap_path.unlink()
        # Run the dynamic script against the snapshot config.
        from report_generator import _execute_experiment_script
        script_data = _execute_experiment_script(config)
        if not script_data:
            logger.warning(f"[脚本执行] 实验{self.experiment_id}脚本未返回数据")
            return
        # Persist the script output back into SQLite.
        script_data_json = json.dumps(script_data, ensure_ascii=False)
        db = sqlite3.connect(str(db_path))
        try:
            cur = db.cursor()
            cur.execute(
                "UPDATE experiments SET script_data=? WHERE id=?",
                (script_data_json, self.experiment_id)
            )
            db.commit()
        finally:
            # BUGFIX: always close; the original leaked the handle on error.
            db.close()
        logger.info(
            f"[脚本执行] ✅ 实验{self.experiment_id}脚本数据已保存到 SQLite "
            f"(数据大小: {len(script_data_json)} 字节)"
        )
        # Optionally mirror the data to SQL Server.
        self._write_to_sqlserver(script_data, work_order_no, config)
    except Exception as e:
        logger.error(
            f"[脚本执行] 实验{self.experiment_id}执行脚本失败: {e}",
            exc_info=True
        )
def _write_to_sqlserver(self, script_data: dict, work_order_no: str, config) -> None:
    """Write the script's output data to SQL Server, if configured.

    Best-effort: a missing config file, debug mode, or an incomplete
    configuration causes an early return, and every exception is caught
    and logged rather than raised.

    Args:
        script_data: Data returned by the dynamic script.
        work_order_no: Work-order number the data belongs to.
        config: Application configuration (source of global parameters).
    """
    try:
        # Load SQL Server settings from work_order_db_config.json next to this module.
        from pathlib import Path
        import json
        config_file = Path(__file__).parent / 'work_order_db_config.json'
        if not config_file.exists():
            logger.info(f"[SQL Server] 配置文件不存在,跳过写入: {config_file}")
            return
        with open(config_file, 'r', encoding='utf-8') as f:
            db_config = json.load(f)
        # Debug mode: log and skip the actual write.
        if db_config.get('debug_mode', False):
            logger.info(f"[SQL Server] 调试模式已启用,跳过实际写入")
            return
        sqlserver_config = {
            'host': db_config.get('host', 'localhost'),
            'port': db_config.get('port', 1433),
            'database': db_config.get('database', ''),
            'username': db_config.get('username', ''),
            'password': db_config.get('password', ''),
        }
        # A database name is the minimum required setting.
        if not sqlserver_config['database']:
            logger.warning(f"[SQL Server] 数据库名未配置,跳过写入")
            return
        logger.info(f"[SQL Server] 开始写入实验{self.experiment_id}的数据")
        from sqlserver_writer import write_script_data_to_sqlserver
        from convert_table_to_sqlserver_format import convert_temperature_table_to_sqlserver
        # Global parameters from the app config, when present.
        global_params = {}
        if hasattr(config, 'globalParameters') and config.globalParameters:
            global_params = config.globalParameters.parameters or {}
        # Convert the script's table data into the SQL Server row format.
        write_data = convert_temperature_table_to_sqlserver(script_data, work_order_no, global_params)
        if not write_data:
            logger.warning(f"[SQL Server] 数据转换失败,跳过写入")
            return
        success = write_script_data_to_sqlserver(write_data, sqlserver_config)
        if success:
            logger.info(f"[SQL Server] ✅ 实验{self.experiment_id}数据已写入 SQL Server")
        else:
            logger.warning(f"[SQL Server] ⚠️ 实验{self.experiment_id}数据写入 SQL Server 失败")
    except Exception as e:
        logger.error(
            f"[SQL Server] 实验{self.experiment_id}写入 SQL Server 异常: {e}",
            exc_info=True
        )
def get_status(self) -> Dict[str, Any]:
    """Return a point-in-time snapshot of the monitor's state as a plain dict."""
    with self._connection_lock:
        worker = self._thread
        last_ok = self._last_success_time
        return {
            'experiment_id': self.experiment_id,
            'work_order_no': self.work_order_no,
            'is_running': worker is not None and worker.is_alive(),
            'is_connected': self._is_connected,
            'consecutive_failures': self._consecutive_failures,
            'last_success_time': last_ok.isoformat() if last_ok else None,
            'last_state': self._last_state,
            'experiment_started': self._experiment_started,
            'experiment_ended': self._experiment_ended,
            'start_time_recorded': self._start_time_recorded,
            'end_time_recorded': self._end_time_recorded
        }