-- Copyright (c) 2024 Huawei Technologies Co., Ltd.
-- openUBMC is licensed under Mulan PSL v2.
-- You can use this software according to the terms and conditions of the Mulan PSL v2.
-- You may obtain a copy of Mulan PSL v2 at: http://license.coscl.org.cn/MulanPSL2
-- THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
-- EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-- MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-- See the Mulan PSL v2 for more details.


local singleton = require 'mc.singleton'
local log = require 'mc.logging'
local c_exp_board = require 'unit_manager.class.unit.exu.exp_board'
local c_cpu_board = require 'unit_manager.class.unit.bcu.cpu_board'
local c_npu_board = require 'unit_manager.class.unit.acu.npu_board'
local c_riser_card = require 'unit_manager.class.unit.ieu.riser_card'
local c_hdd_backplane = require 'unit_manager.class.unit.seu.hdd_backplane'
local c_fan_board = require 'unit_manager.class.unit.clu.fan_board'
local c_psu_board = require 'unit_manager.class.unit.psu.psu_board'
local c_peu_board = require 'unit_manager.class.unit.peu.peu_board'
local c_logic_fw = require 'unit_manager.class.logic_fw.fw_init'
local c_m2_transform_card = require 'unit_manager.class.unit.seu.m2_transform_card'
local cmn = require 'common'
local file_sec = require 'utils.file'
local utils = require 'mc.utils'
local utils_core = require 'utils.core'
local fructl = require 'mcu.upgrade.fructl_handler'
local log_collector = require 'log_collector'
local fpga_signal = require 'unit_manager.class.logic_fw.fpga.fpga_signal'
local ipmi = require 'ipmi'
local imu_cmd = require 'imu_cmd'
local skynet = require 'skynet'
local defs = require 'unit_manager.class.logic_fw.comm_defs'
local base_messages = require 'messages.base'
-- BusType value of NpuBoards that are reset per region (see reset_npu_device)
local NPU_REGION_TYPE<const> = 2
-- Shorthand for the IPMI code table (presumably completion codes; used as comp_code.Success)
local comp_code = ipmi.types.Cc

-- Index of the reset property name inside a REGION_PROP entry
local RESET_INDEX<const> = 1
-- Value written to the reset property to put the region into reset
local RESET<const> = 0
-- Value written to the reset property to release the region from reset
local UNRESET<const> = 1
-- Index of the reset-lock property name inside a REGION_PROP entry
local LOCK_INDEX<const> = 2
-- Value written to the reset-lock property before a reset is issued
local RESET_UNLOCK<const> = 0
-- Index of the region display name inside a REGION_PROP entry (used in operation logs)
local REGION_NAME_IDNEX<const> = 3
-- NpuId that selects every NPU board in set_npu_power_cap
local ALL_NPU_BOARD_NUM<const> = 255
-- Metric name of the NPU board power reading
local NPUBOARD_POWERWATTS<const> = 'npuboard.powerwatts'

-- Metric names reported for an NpuBoard by get_npu_metric_collection_items
local NPUBOARD_METRIC_NAME<const> = {
    NPUBOARD_POWERWATTS
}

-- Class table; instances are created by unit_manager.new
local unit_manager = {}
unit_manager.__index = unit_manager

-- Wrap a newly added ExpBoard object and index it by position.
-- object:   the object being added
-- position: position key of the board
-- NOTE(review): the time-sync task forked here has no stop condition, so it
-- presumably keeps running after the board is deleted — confirm.
function unit_manager:add_expboard_obj_cb(object, position)
    local board = c_exp_board.new(self.bus, object, position, self.db)
    table.insert(self.exp_boards, board)
    self:task_set_ref_mcu_obj(board)
    table.insert(self.exp_board_position, position)
    self.device_names[position] = board:get_prop('DeviceName')
    self.unit_collection[position] = board
    if board:get_prop('SyncTimeSupport') ~= 1 then
        return
    end
    -- Boards that support time sync get a background refresh every 300 s
    cmn.skynet.fork(function()
        while true do
            board:task_update_exp_board_time()
            cmn.skynet.sleep(30000)
        end
    end)
end

-- Drop every index entry that add_expboard_obj_cb created for this position.
-- object is unused; position identifies the board to remove.
function unit_manager:delete_expboard_obj_cb(object, position)
    local boards, positions = self.exp_boards, self.exp_board_position
    cmn.remove_ele_by_position(boards, position)
    cmn.remove_ele_by_value(positions, position)
    self.device_names[position], self.unit_collection[position] = nil, nil
end

-- Wrap a newly added CpuBoard object, start its update task and index it by position.
-- object:   the object being added
-- position: position key of the board
function unit_manager:add_cpuboard_obj_cb(object, position)
    local board = c_cpu_board.new(self.bus, object, position, self.db)
    table.insert(self.cpu_boards, board)
    self:task_set_ref_mcu_obj(board)
    table.insert(self.cpu_board_position, position)
    board:task_update()
    local device_name = board:get_prop('DeviceName')
    self.device_names[position] = device_name
    self.unit_collection[position] = board
end

-- Drop every index entry that add_cpuboard_obj_cb created for this position.
-- object is unused; position identifies the board to remove.
function unit_manager:delete_cpuboard_obj_cb(object, position)
    local boards, positions = self.cpu_boards, self.cpu_board_position
    cmn.remove_ele_by_position(boards, position)
    cmn.remove_ele_by_value(positions, position)
    self.device_names[position], self.unit_collection[position] = nil, nil
end

-- Wrap a newly added NpuBoard object, index it by object path and position,
-- and start its background tasks.
-- object:   the object being added (its path keys npu_board_map)
-- position: position key of the board
-- NOTE(review): the forked loops have no stop condition, so they presumably
-- keep running after the board is deleted — confirm.
function unit_manager:add_npuboard_obj_cb(object, position)
    local board = c_npu_board.new(self.bus, object, position, self.db)

    -- Run job forever in its own coroutine, sleeping `ticks` between rounds
    local function run_every(ticks, job)
        cmn.skynet.fork(function()
            while true do
                job()
                cmn.skynet.sleep(ticks)
            end
        end)
    end

    self.npu_board_map[object.path] = board
    table.insert(self.npu_boards, board)
    self:task_set_ref_mcu_obj(board)
    board:task_update()
    self.device_names[position] = board:get_prop('DeviceName')
    self.unit_collection[position] = board

    -- Time refresh, 60 s period
    run_every(6000, function()
        board:update_npuboard_time()
    end)
    cmn.skynet.fork(function()
        board:update_npuboard_health()
    end)
    -- SMC communication check of the NPU module (writes maintenance logs), 10 s period
    run_every(1000, function()
        board:check_mcu_i2c_communication()
    end)
end

-- Drop every index entry that add_npuboard_obj_cb created for this board.
-- object:   the object being removed; its path keys npu_board_map
-- position: position key of the board
function unit_manager:delete_npuboard_obj_cb(object, position)
    -- add_npuboard_obj_cb registers the board under object.path; without clearing
    -- it here, reset_npu_device and get_npu_metric_collection_items would keep
    -- resolving a board that no longer exists.
    if object and object.path then
        self.npu_board_map[object.path] = nil
    end
    cmn.remove_ele_by_position(self.npu_boards, position)
    self.device_names[position] = nil
    self.unit_collection[position] = nil
end

-- Wrap a newly added HddBackplane object, apply the slot configuration known
-- so far and index it by position.
-- object:   the object being added
-- position: position key of the backplane
function unit_manager:add_hddbackplane_obj_cb(object, position)
    local backplane = c_hdd_backplane.new(self.bus, object, position, self.db)
    table.insert(self.hdd_backplanes, backplane)
    backplane:update_slot_info(self.board_slot_configs)
    backplane:update_start_slot(self.customize_sign)
    self:task_set_ref_mcu_obj(backplane)
    local device_name = backplane:get_prop('DeviceName')
    self.device_names[position] = device_name
    self.unit_collection[position] = backplane
end

-- Drop every index entry that add_hddbackplane_obj_cb created for this position.
-- object is unused; position identifies the backplane to remove.
function unit_manager:delete_hddbackplane_obj_cb(object, position)
    local backplanes = self.hdd_backplanes
    cmn.remove_ele_by_position(backplanes, position)
    self.device_names[position], self.unit_collection[position] = nil, nil
end

-- Wrap a newly added FanBoard object, start its update task and index it by position.
-- object:   the object being added
-- position: position key of the board
function unit_manager:add_fanboard_obj_cb(object, position)
    local board = c_fan_board.new(self.bus, object, position, self.db)
    table.insert(self.fan_boards, board)
    self:task_set_ref_mcu_obj(board)
    board:task_update()
    local device_name = board:get_prop('DeviceName')
    self.device_names[position] = device_name
    self.unit_collection[position] = board
end

-- Drop every index entry that add_fanboard_obj_cb created for this position.
-- object is unused; position identifies the board to remove.
function unit_manager:delete_fanboard_obj_cb(object, position)
    local boards = self.fan_boards
    cmn.remove_ele_by_position(boards, position)
    self.device_names[position], self.unit_collection[position] = nil, nil
end

-- Wrap a newly added PsuBoard object, start its update task and index it by position.
-- object:   the object being added
-- position: position key of the board
function unit_manager:add_psuboard_obj_cb(object, position)
    local board = c_psu_board.new(self.bus, object, position, self.db)
    table.insert(self.psu_boards, board)
    self:task_set_ref_mcu_obj(board)
    board:task_update()
    local device_name = board:get_prop('DeviceName')
    self.device_names[position] = device_name
    self.unit_collection[position] = board
end

-- Drop every index entry that add_psuboard_obj_cb created for this position.
-- object is unused; position identifies the board to remove.
function unit_manager:delete_psuboard_obj_cb(object, position)
    local boards = self.psu_boards
    cmn.remove_ele_by_position(boards, position)
    self.device_names[position], self.unit_collection[position] = nil, nil
end

-- Wrap a newly added PeuBoard object, start its update task and index it by position.
-- Unlike the other board callbacks, no MCU reference task is started here.
-- object:   the object being added
-- position: position key of the board
function unit_manager:add_peuboard_obj_cb(object, position)
    local board = c_peu_board.new(self.bus, object, position, self.db)
    table.insert(self.peu_boards, board)
    board:task_update()
    local device_name = board:get_prop('DeviceName')
    self.device_names[position] = device_name
    self.unit_collection[position] = board
end

-- Drop every index entry that add_peuboard_obj_cb created for this position.
-- object is unused; position identifies the board to remove.
function unit_manager:delete_peuboard_obj_cb(object, position)
    local boards = self.peu_boards
    cmn.remove_ele_by_position(boards, position)
    self.device_names[position], self.unit_collection[position] = nil, nil
end

-- Wrap a newly added RiserCard object, start its update task and index it by position.
-- object:   the object being added
-- position: position key of the card
function unit_manager:add_risercard_obj_cb(object, position)
    local card = c_riser_card.new(self.bus, object, position, self.db)
    table.insert(self.riser_cards, card)
    self:task_set_ref_mcu_obj(card)
    card:task_update()
    -- Keep the name consistent with the CLI: drop the first four characters so
    -- that "PCIeRiser..." is stored as "Riser..."
    local full_name = card:get_prop('DeviceName')
    self.device_names[position] = string.sub(full_name, 5)
    self.unit_collection[position] = card
end

-- Drop every index entry that add_risercard_obj_cb created for this position.
-- object is unused; position identifies the card to remove.
function unit_manager:delete_risercard_obj_cb(object, position)
    local cards = self.riser_cards
    cmn.remove_ele_by_position(cards, position)
    self.device_names[position], self.unit_collection[position] = nil, nil
end

-- Wrap a newly added M2TransferCard object, start its update task and index it by position.
-- object:   the object being added
-- position: position key of the card
function unit_manager:add_m2_transform_card_obj_cb(object, position)
    local card = c_m2_transform_card.new(self.bus, object, position, self.db)
    table.insert(self.m2_transform_cards, card)
    self:task_set_ref_mcu_obj(card)
    card:task_update()
    local device_name = card:get_prop('DeviceName')
    self.device_names[position] = device_name
    self.unit_collection[position] = card
end

-- Drop every index entry that add_m2_transform_card_obj_cb created for this position.
-- object is unused; position identifies the card to remove.
function unit_manager:delete_m2_transform_card_obj_cb(object, position)
    local cards = self.m2_transform_cards
    cmn.remove_ele_by_position(cards, position)
    self.device_names[position], self.unit_collection[position] = nil, nil
end

-- Dispatch an "object added" notification to the handler for its class.
-- class_name: class of the added object; unknown classes are ignored silently
-- object:     the object being added
-- position:   position key of the object
function unit_manager:on_add_object(class_name, object, position)
    -- Board classes map directly onto an add_*_obj_cb method of this manager
    local board_callbacks = {
        ExpBoard = 'add_expboard_obj_cb',
        CpuBoard = 'add_cpuboard_obj_cb',
        NpuBoard = 'add_npuboard_obj_cb',
        M2TransferCard = 'add_m2_transform_card_obj_cb',
        HddBackplane = 'add_hddbackplane_obj_cb',
        FanBoard = 'add_fanboard_obj_cb',
        PsuBoard = 'add_psuboard_obj_cb',
        PeuBoard = 'add_peuboard_obj_cb',
        RiserCard = 'add_risercard_obj_cb'
    }

    local callback_name = board_callbacks[class_name]
    if callback_name then
        self[callback_name](self, object, position)
    elseif class_name == 'BoardSlotConfig' then
        table.insert(self.board_slot_configs, {mds_obj = object, position = position})
        -- Remembered so on_add_object_complete can refresh the backplane slot info
        self.psr_position = position
    elseif class_name == 'LogicFirmware' then
        -- get_device_name may wait for the owning board, so run in a separate coroutine
        cmn.skynet.fork(function ()
            local device_name = self:get_device_name(position)
            local firmware = c_logic_fw.new(object, position, self.bus, device_name)
            table.insert(self.logic_fw, firmware)
            firmware:register_logic_fw_info()
        end)
    elseif class_name == 'CpldReset' then
        self.cpld_reset = object
    else
        return
    end

    log:notice('[UnitManager] Add object, class: %s, position: %s', class_name, position)
end

-- Dispatch an "object deleted" notification to the handler for its class.
-- class_name: class of the deleted object; unknown classes are ignored silently
-- object:     the object being removed
-- position:   position key of the object
function unit_manager:on_delete_object(class_name, object, position)
    -- Board classes map directly onto a delete_*_obj_cb method of this manager
    local board_callbacks = {
        ExpBoard = 'delete_expboard_obj_cb',
        CpuBoard = 'delete_cpuboard_obj_cb',
        NpuBoard = 'delete_npuboard_obj_cb',
        HddBackplane = 'delete_hddbackplane_obj_cb',
        M2TransferCard = 'delete_m2_transform_card_obj_cb',
        FanBoard = 'delete_fanboard_obj_cb',
        PsuBoard = 'delete_psuboard_obj_cb',
        PeuBoard = 'delete_peuboard_obj_cb',
        RiserCard = 'delete_risercard_obj_cb'
    }

    local callback_name = board_callbacks[class_name]
    if callback_name then
        self[callback_name](self, object, position)
    elseif class_name == 'BoardSlotConfig' then
        cmn.remove_ele_by_position(self.board_slot_configs, position)
    elseif class_name == 'LogicFirmware' then
        -- Unregister first: it looks the firmware up in self.logic_fw
        self:unregister_cpld_firmware_info(position)
        cmn.remove_ele_by_position(self.logic_fw, position)
    elseif class_name == 'CpldReset' then
        self.cpld_reset = nil
    else
        return
    end

    log:notice('[UnitManager] Delete object, class: %s, position: %s', class_name, position)
end

-- Collect the logic firmware objects at `pos` whose csr.Name starts with
-- "<unit_name>_CPLD".
-- manager:   table holding the logic_fw list
-- pos:       position to match against each firmware's position field
-- unit_name: unit prefix such as 'BCU' or 'EXU'; it is spliced into a Lua
--            pattern unescaped
-- Returns a new array of the matching firmware objects (possibly empty).
local function get_valid_fw(manager, pos, unit_name)
    -- Firmware whose name is not of the form XXX_CPLD does not match here and is
    -- handled during on_add_object instead
    local name_pattern = "^" .. unit_name .. "_CPLD"
    local matched = {}
    for _, firmware in pairs(manager.logic_fw) do
        -- fw_info.id used for matching during on_add_object refers to this same Name
        local at_position = firmware.position == pos
        if at_position and string.match(firmware.csr.Name, name_pattern) then
            matched[#matched + 1] = firmware
        end
    end
    return matched
end

-- Report whether any logic firmware at `pos` has 'FPGA' in its csr.Name.
-- manager: table holding the logic_fw list
-- pos:     position to match against each firmware's position field
-- Returns true or false; the outcome is also logged at notice level.
local function is_find_fpga_fw(manager, pos)
    local found = false
    for _, firmware in pairs(manager.logic_fw) do
        if firmware.position == pos and string.find(firmware.csr.Name, 'FPGA') then
            found = true
            break
        end
    end
    if found then
        log:notice("fpga firmware exists on the board")
    else
        log:notice("fpga firmware not exists on the board")
    end
    return found
end

-- Build a comma-terminated list of firmware ids ("id1,id2,") for logging, and
-- fill in the names of entries that were created before their board was known.
-- tbl: table of logic firmware objects; anything that is not a table yields ""
-- Side effect: sets device_name and name on entries whose device_name is unset.
-- The call may block, because get_device_name retries with sleeps.
function unit_manager:get_fw_list_complete(tbl)
    if type(tbl) ~= "table" then
        return ""
    end
    local id_parts = {}
    for _, firmware in pairs(tbl) do
        id_parts[#id_parts + 1] = string.format("%s,", firmware.id)
        -- A nil device_name means the Board object was distributed at least one
        -- minute after the LogicFirmware object, so part of the LogicFirmware
        -- properties must be refreshed now
        if not firmware.device_name then
            firmware.device_name = self:get_device_name(firmware.position) or 'Unknown Board'
            local logic_kind = string.match(firmware.csr.Name, 'FPGA') and ' FPGA' or ' CPLD'
            local cpld_index = string.match(firmware.csr.Name, 'CPLD(%d)') or ''
            firmware.name = firmware.device_name .. logic_kind .. cpld_index
        end
    end
    return table.concat(id_parts)
end

-- Report whether `position` occurs among the values of `tbl`.
-- tbl:      table whose values are compared with ==
-- position: value to look for
-- Returns true when found, false otherwise.
local function include_position(tbl, position)
    local found = false
    for _, candidate in pairs(tbl) do
        found = (candidate == position)
        if found then
            break
        end
    end
    return found
end

-- Timeout handed to fructl when (un)locking power-on; presumably seconds, per the name
local LOCK_TIME_FIVE_MINUTE<const> = 300

-- Poll the VRDUpgradeState of every NPU board once per second and hold the
-- power-on lock while any board reports state 1. Never returns; it is meant
-- to run in its own coroutine.
function unit_manager:monitor_npuboard_vrd_upgrade_status()
    local last_state = 0
    self.npu_vrd_monitor_task = true
    while true do
        cmn.skynet.sleep(100)
        -- Recomputed from scratch every round. If this carried over from the
        -- previous round, an emptied npu_boards list would leave a stale 1
        -- behind and the lock would never be released.
        local vrd_upgrade_state = 0
        for _, obj in pairs(self.npu_boards) do
            local ok, state = pcall(function()
                return obj:get_prop('VRDUpgradeState')
            end)
            -- A single module that forbids power-on is enough to lock
            if ok and state == 1 then
                log:debug("vrd upgrade state is 1")
                vrd_upgrade_state = 1
                break
            end
        end
        if vrd_upgrade_state == last_state then
            log:debug("vrd upgrade state not change, state : %s", vrd_upgrade_state)
        else
            last_state = vrd_upgrade_state
            local lock = vrd_upgrade_state == 1
            fructl.set_poweron_lock_until_success(self.bus, 1, lock, LOCK_TIME_FIVE_MINUTE, 'NPU_VRD')
            if lock then
                log:maintenance(log.MLOG_ERROR, log.FC__PUBLIC_OK, "lock power on because of NPU VRD upgrade")
            else
                log:maintenance(log.MLOG_ERROR, log.FC__PUBLIC_OK,
                    "unlock power on because of NPU VRD upgrade completed")
            end
        end
    end
end

-- Called when all objects of one position have been added. Starts the
-- follow-up work that needs the complete object set: backplane slot refresh,
-- CPLD/FPGA upgrade resource registration, and the one-off NPU monitor tasks.
-- position: position whose object distribution has finished
function unit_manager:on_add_object_complete(position)
    -- Refresh the slot info of every HDD backplane
    if self.psr_position == position then
        cmn.skynet.fork(function ()
            for _, obj in pairs(self.hdd_backplanes) do
                obj:update_slot_info(self.board_slot_configs)
            end
        end)
    end

    if include_position(self.cpu_board_position, position) then
        cmn.skynet.fork(function ()
            local unit_name = 'BCU'
            local cpu_board_obj = self:get_board_obj_by_position(position)
            if cpu_board_obj then
                cpu_board_obj:update_bmc_start_flag()
            end

            cmn.skynet.sleep(500) -- wait 5 s so that looking up BCU_CPLD does not fail
            local fw_list = get_valid_fw(self, position, unit_name)
            log:notice("%s valid fw list : %s", unit_name, self:get_fw_list_complete(fw_list))
            -- Only after distribution has finished is the number of upgrade resources known
            c_logic_fw.register_Logic_fw(fw_list, unit_name)
        end)
    end

    if include_position(self.exp_board_position, position) then
        cmn.skynet.fork(function ()
            local unit_name = 'EXU'
            cmn.skynet.sleep(500) -- wait 5 s so that looking up EXU_CPLD does not fail
            local fw_list = get_valid_fw(self, position, unit_name)
            log:notice("%s valid fw list : %s", unit_name, self:get_fw_list_complete(fw_list))
            -- Only after distribution has finished is the number of upgrade resources known
            c_logic_fw.register_Logic_fw(fw_list, unit_name)
            -- FPGA firmware exists on the board: start the FPGA initialisation
            if is_find_fpga_fw(self, position) then
                fpga_signal.init_fpga_proc(self.device_names[position])
            end
        end)
    end

    if next(self.npu_boards) ~= nil then
        if not self.npu_vrd_monitor_task then
            -- Set the flag before forking, as done for the log dump task below.
            -- If it were only set inside the forked task, a second call arriving
            -- before that task runs would start a duplicate monitor.
            self.npu_vrd_monitor_task = true
            cmn.skynet.fork(function ()
                self:monitor_npuboard_vrd_upgrade_status()
            end)
        end
        if not self.npu_log_dump_mcu_error_task_flag then
            self.npu_log_dump_mcu_error_task_flag = true
            cmn.skynet.fork(function ()
                self:log_dump_mcu_error_task()
            end)
        end
    end
end

-- Associate a unit with the MCU upgrade object at the same position.
-- The lookup runs in its own coroutine and is retried every 2 s, up to 61
-- attempts, because the MCU object may not be available yet. It gives up
-- silently when no MCU object appears.
-- obj: unit wrapper exposing position, mds_obj and set_ref_mcu_obj
function unit_manager:task_set_ref_mcu_obj(obj)
    local MAX_ATTEMPTS<const> = 61
    cmn.skynet.fork(function ()
        for _ = 1, MAX_ATTEMPTS do
            -- Treat a missing 'mcu' service like a missing MCU object and retry,
            -- instead of crashing the coroutine on a nil index
            local mcu_service = self.service_manager:get_service('mcu')
            local mcu_obj = mcu_service and mcu_service:get_obj_by_position(obj.position)
            if mcu_obj then
                obj:set_ref_mcu_obj(mcu_obj)
                log:debug('[UnitManager] %s referenced %s', obj.mds_obj.name, mcu_obj.mcu.name)
                return
            end
            cmn.skynet.sleep(200)
        end
    end)
end

-- Create a unit_manager instance bound to the given service manager.
-- The remaining state is set up later by init.
function unit_manager.new(service_manager)
    local instance = {service_manager = service_manager}
    return setmetatable(instance, unit_manager)
end

-- Look up the device name registered for a position, waiting for it if needed.
-- Firmware objects can be loaded before their board, so the lookup is retried
-- every 3 s, 31 attempts in total, before giving up.
-- position: position key; when it is nil/false the function returns nothing
-- Returns the device name, or nil (after logging an error) if none was registered.
function unit_manager:get_device_name(position)
    if not position then
        return
    end
    local retries = 0
    while retries <= 30 do
        local name = self.device_names[position]
        if name then
            return name
        end
        retries = retries + 1
        cmn.skynet.sleep(300)
    end
    log:error('get device_name failed. position: %s, retry: %s', position, retries)
    return nil
end

-- Trigger a log dump on every NPU board, one coroutine per board.
-- Does nothing while a previous full collection is still running;
-- npu_collecting_count holds the number of dumps still outstanding.
function unit_manager:update_npu_logs()
    -- A collection is already running: do not start another one
    if self.npu_collecting_count ~= 0 then
        return
    end
    for _, obj in pairs(self.npu_boards) do
        -- Count per started dump rather than trusting #self.npu_boards, so the
        -- counter always matches the number of coroutines that decrement it
        self.npu_collecting_count = self.npu_collecting_count + 1
        cmn.skynet.fork(function ()
            -- A failing dump must still release its slot in the counter,
            -- otherwise every later collection would be blocked for good
            local ok, err = pcall(function ()
                obj:update_dump_log()
            end)
            if not ok then
                log:error('[unit_manager] update npu dump log failed, err %s', err)
            end
            self.npu_collecting_count = self.npu_collecting_count - 1
        end)
    end
end

-- Start the tasks that collect NPU module MCU logs when an MCU reports an error.
-- Blocks for 360 s first, so it is meant to run in its own coroutine.
function unit_manager:log_dump_mcu_error_task()
    -- Wait 360 s so that every NpuBoard object has been loaded
    cmn.skynet.sleep(36000)
    local boards = self.npu_boards
    log:notice('log_dump_mcu_error_task npu_boards total number:%s', #boards)
    for _, board in pairs(boards) do
        self:monitor_collect_mcu_log_flag(board)
    end

    -- Every 10 s, check whether a module MCU needs another collection round
    local function poll_pending_collections()
        while true do
            self:monitor_need_collect_mcu()
            cmn.skynet.sleep(1000)
        end
    end
    skynet.fork_loop({count = 0}, poll_pending_collections)
end

-- Subscribe to one NpuBoard's property changes and start an MCU log collection
-- whenever CollectMCULogFlag changes to a non-zero value.
-- obj: NpuBoard wrapper exposing get_prop and mds_obj.property_changed
function unit_manager:monitor_collect_mcu_log_flag(obj)
    local slot = obj:get_prop('Slot')
    local current_flag = obj:get_prop('CollectMCULogFlag')
    log:notice('start monitor NpuBoard%s CollectMCULogFlag:%s', slot, current_flag)

    -- React to changes of CollectMCULogFlag only
    local function on_property_changed(name, value)
        if name ~= 'CollectMCULogFlag' or value == 0 then
            return
        end
        log:notice('receive NpuBoard%s mcu error signature:%s', slot, value)
        self:update_npu_log(obj)
    end
    obj.mds_obj.property_changed:on(on_property_changed)
end

-- Re-trigger the MCU log collection for every NPU board whose slot is marked
-- as pending (value 1) in npu_need_collect_count_table.
function unit_manager:monitor_need_collect_mcu()
    local pending = self.npu_need_collect_count_table
    for _, board in pairs(self.npu_boards) do
        local slot = board:get_prop('Slot')
        local needs_collect = slot and pending[slot] == 1
        if needs_collect then
            self:update_npu_log(board)
        end
    end
end

-- Collect the MCU log of one NpuBoard.
-- obj: NpuBoard wrapper exposing get_prop('Slot') and update_dump_log
-- If the slot is already being collected (or is inside its 10-minute
-- cool-down), the request is recorded as pending and picked up later by
-- monitor_need_collect_mcu. Nothing is started while a full collection
-- (update_npu_logs) is in progress.
function unit_manager:update_npu_log(obj)
    local slot = obj:get_prop('Slot')
    if not slot then
        log:error('NpuBoard Slot is nil')
        return
    end

    -- Collection already triggered for this MCU: only remember that another
    -- round is wanted once the current one has finished
    local collecting = self.npu_collecting_count_table[slot]
    if collecting and collecting ~= 0 then
        local pending = self.npu_need_collect_count_table[slot]
        if not pending or pending == 0 then
            self.npu_need_collect_count_table[slot] = 1
            log:notice('need dump NpuBoard%s mcu log later', slot)
        end
        return
    end

    -- A full MCU log collection is running: do not collect again
    if self.npu_collecting_count ~= 0 then
        return
    end

    self.npu_collecting_count_table[slot] = 1
    self.npu_need_collect_count_table[slot] = 0
    cmn.skynet.fork(function()
        -- A failing dump must not leave the slot marked as collecting forever,
        -- so the error is logged and the cool-down still runs
        local ok, err = pcall(function()
            obj:update_dump_log()
        end)
        if not ok then
            log:error('[unit_manager] update NpuBoard%s dump log failed, err %s', slot, err)
        end
        -- Keep the slot blocked for 10 minutes after the collection so the same
        -- MCU is not collected again straight away
        cmn.skynet.sleep(60000)
        self.npu_collecting_count_table[slot] = 0
    end)
end

-- Subscribe to the log collector signals that trigger a full NPU log collection:
-- immediately on the BMC reset signal, and 10 minutes after the OS reset signal.
function unit_manager:listen_mcu_dump()
    local function on_bmc_reset()
        log:notice('receive bmc reset signature')
        self:update_npu_logs()
    end

    local function on_os_reset()
        log:notice('receive os reset signature')
        cmn.skynet.fork_once(function()
            -- Wait 10 minutes after power-on so the NPU MCU has finished
            -- dumping the NPU serial port log
            cmn.skynet.sleep(60000)
            self:update_npu_logs()
        end)
    end

    log_collector.log_dump_reset_bmc_sig:on(on_bmc_reset)
    log_collector.log_npu_mcu_dump_reset_os_sig:on(on_os_reset)
end

-- Initialise the manager state and start its long-running tasks.
-- bus: bus handle passed on to every unit wrapper
-- db:  database handle; also provides the CustomizeSign record
function unit_manager:init(bus, db)
    self.bus = bus
    self.db = db

    -- Containers that start out empty. unit_collection holds the handle of
    -- every unit, key: position, value: object.
    local containers = {
        'exp_boards', 'cpu_boards', 'npu_boards', 'hdd_backplanes', 'fan_boards',
        'psu_boards', 'peu_boards', 'riser_cards', 'm2_transform_cards',
        'device_names', 'board_slot_configs', 'logic_fw',
        'exp_board_position', 'cpu_board_position', 'unit_collection'
    }
    for _, field in ipairs(containers) do
        self[field] = {}
    end
    self.psr_position = nil
    self.npu_vrd_monitor_task = false
    self.npu_log_dump_mcu_error_task_flag = false

    -- Used for the StartSlot customisation of the HDD backplanes
    self.customize_sign = db.CustomizeSign({Id = 'customsize_sign'})

    self.npu_collecting_count = 0
    -- Per-MCU collection state: whether that MCU's log is being collected now
    self.npu_collecting_count_table = {}
    -- Per-MCU pending state: whether that MCU needs another collection round
    self.npu_need_collect_count_table = {}
    self.npu_board_map = {}

    self:listen_mcu_dump()
    -- Start the serial CPLD status check task
    cmn.skynet.fork(function()
        self:start_serial_cpld_status_task()
    end)
    -- Start the serial riser card MCU status check task
    cmn.skynet.fork(function()
        self:start_serial_riser_mcu_status_task()
    end)
    log:notice('[unit_manager] init finished.')
end

-- Section header written to the card_info dump file for each unit collection.
-- Keys are field names of the unit_manager instance (see on_dump_cb).
local header_names = {
    ['cpu_boards'] = 'CPU board Info\n',
    ['npu_boards'] = 'NPU board Info\n',
    ['fan_boards'] = 'FAN board Info\n',
    ['psu_boards'] = 'PSU board Info\n',
    ['peu_boards'] = 'PEU board Info\n',
    ['exp_boards'] = 'EXP board info\n',
    ['hdd_backplanes'] = 'HDD Backplane Info\n',
    ['riser_cards'] = 'Riser Card Info\n',
    ['m2_transform_cards'] = 'M2 Transform Info\n'
}

-- Log collection callback: lets every unit dump its own data, then appends a
-- summary table per unit collection to <dest_path>/card_info.
-- ctx:       unused here
-- dest_path: existing directory that receives the dump; otherwise nothing is written
function unit_manager:on_dump_cb(ctx, dest_path)
    log:notice('unit_manager:on_dump_cb start.')
    -- Nothing to do when the destination directory does not exist
    if not utils_core.is_dir(dest_path) then
        log:error('log dump: the path doesn\'t exist')
        return
    end

    -- Collections whose units dump into dest_path, in the order they are visited
    local dump_order = {
        'cpu_boards', 'exp_boards', 'fan_boards', 'psu_boards', 'peu_boards', 'hdd_backplanes'
    }
    for _, set_name in ipairs(dump_order) do
        for _, unit in pairs(self[set_name]) do
            unit:on_dump_cb(dest_path)
        end
    end
    -- NPU boards use their own MCU log dump entry point, which takes no path
    for _, npu_board in pairs(self.npu_boards) do
        npu_board:on_dump_mcu_log_cb()
    end
    for _, card in pairs(self.m2_transform_cards) do
        card:on_dump_cb(dest_path)
    end

    local file_name = dest_path .. '/card_info'
    local file, err = file_sec.open_s(file_name, 'a+')
    if not file then
        log:error('open file failed, err: %s', err)
        return
    end

    local function write_unit_tables()
        utils_core.chmod_s(file_name, utils.S_IRUSR | utils.S_IWUSR | utils.S_IRGRP)
        for set_name, header_name in pairs(header_names) do
            local units = self[set_name]
            if next(units) then
                self:on_dump_unit_info(file, units, header_name)
            end
        end
    end
    utils.safe_close_file(file, write_unit_tables)
end

-- Format the FPGA variant of the unit info table for one unit: a header line
-- followed by one data line, with the FPGA version taken from MultiLogicVersion.
-- v: unit wrapper exposing get_prop; MultiLogicVersion must be indexable
-- Returns the two lines as one string, each terminated by '\n'.
local function on_dump_fpga_info(v)
    local header_format =
        '%-15s | %-15s | %-15s | %-15s | %-32s | %-32s | %-15s | %-15s | %-15s | %-15s | %-15s\n'
    local row_format =
        '%-15s | %-15s | %-15s | %-15u | %-32s | %-32s | %-15s | %-15s | %-15s | %-15s | %-10s\n'

    local header = string.format(header_format,
        'Name', 'Position', 'Manufacturer', 'Slot', 'Description', 'UID', 'SerialNum', 'PartNum', 'PCB Ver',
        'FPGAVersion', 'HWSRVersion')
    local row = string.format(row_format,
        v:get_prop('Name'), v:get_prop('Position'), v:get_prop('Manufacturer'), v:get_prop('Slot'),
        v:get_prop('Description'), v:get_prop('UID'), v:get_prop('SerialNumber'), v:get_prop('PartNumber'),
        v:get_prop('PcbVersion'), v:get_prop('MultiLogicVersion').FPGA, v:get_prop('SRVersion'))
    return header .. row
end

-- Write the info table of one unit collection to the dump file.
-- file:        open file handle exposing write
-- obj_set:     table of unit wrappers exposing get_prop
-- header_name: section header, written before the table
-- A unit whose MultiLogicVersion carries an FPGA entry gets an additional
-- FPGA table right after its own line.
function unit_manager:on_dump_unit_info(file, obj_set, header_name)
    file:write(header_name)
    -- Gather the pieces and join once rather than re-concatenating per unit
    local lines = {
        string.format(
            '%-15s | %-15s | %-15s | %-15s | %-32s | %-32s | %-15s | %-15s | %-15s | %-15s | %-15s\n',
            'Name', 'Position', 'Manufacturer', 'Slot', 'Description', 'UID', 'SerialNum', 'PartNum', 'PCB Ver',
            'CPLDVersion', 'HWSRVersion')
    }
    for _, v in pairs(obj_set) do
        lines[#lines + 1] = string.format(
            '%-15s | %-15s | %-15s | %-15u | %-32s | %-32s | %-15s | %-15s | %-15s | %-15s | %-10s\n',
            v:get_prop('Name'), v:get_prop('Position'), v:get_prop('Manufacturer'), v:get_prop('Slot'),
            v:get_prop('Description'), v:get_prop('UID'), v:get_prop('SerialNumber'), v:get_prop('PartNumber'),
            v:get_prop('PcbVersion'), v:get_prop('LogicVersion'), v:get_prop('SRVersion'))
        -- A unit that reports no MultiLogicVersion must not abort the whole dump
        local multi_logic = v:get_prop('MultiLogicVersion')
        if multi_logic and multi_logic.FPGA then
            lines[#lines + 1] = on_dump_fpga_info(v)
        end
    end
    lines[#lines + 1] = '\r\r'
    file:write(table.concat(lines))
    log:notice('%s collect finish', header_name)
end

-- Find the NPU board whose Slot property equals `slot`.
-- Returns the board wrapper, or nothing when no board matches.
function unit_manager:get_npu_by_slot(slot)
    for _, board in pairs(self.npu_boards) do
        local board_slot = board:get_prop('Slot')
        if board_slot == slot then
            return board
        end
    end
end

-- Reset all chips of the NPU board in `slot` by writing 0 to its Reset property.
-- obj:     unused here
-- context: request context; its initiator is recorded in the operation log
-- slot:    Slot value of the target board
-- Returns nothing; the outcome is reported through the logs only.
function unit_manager:reset_npu(obj, context, slot)
    local npu_board = self:get_npu_by_slot(slot)
    if not npu_board then
        log:error('[unit_manager] reset npu failed, slot(%s) invalid', slot)
        return
    end

    local ok, err = pcall(npu_board.set_prop, npu_board, 'Reset', 0x0)
    if ok then
        log:notice('[unit_manager] reset npu success.')
        log:operation(context:get_initiator(), 'general_hardware',
            'Reset all chips of NPU board %s successfully', slot)
    else
        log:error('[unit_manager] reset npu failed, err %s', err)
        log:operation(context:get_initiator(), 'general_hardware',
            'Reset all chips of NPU board %s failed', slot)
    end
end

-- Per-region properties used by reset_npu_region, keyed by region id.
-- Each entry is { reset property, reset-lock property, region name }, addressed
-- through RESET_INDEX, LOCK_INDEX and REGION_NAME_IDNEX.
local REGION_PROP<const> = {
    [0x0] = {
        'GlobalReset', 'GlobalResetLocked', 'Global'
    },
    [0x1] = {
        'ComputingUnitReset', 'ComputingUnitResetLocked', 'ComputingUnit'
    }
}

-- Release a region from reset by writing UNRESET to its reset property.
-- npu_board: board wrapper exposing set_prop
-- prop:      REGION_PROP entry of the region
-- Returns true on success, false (after logging) when the write raised an error.
function unit_manager:post_reset(npu_board, prop)
    local reset_prop = prop[RESET_INDEX]
    local ok, err = pcall(npu_board.set_prop, npu_board, reset_prop, UNRESET)
    if ok then
        return true
    end
    log:error('[unit_manager]unreset npu board fail, err %s', err)
    return false
end

-- Reset one region of an NPU board: unlock the reset, assert it, then release
-- it again through post_reset. The release is attempted even when asserting failed.
-- npu_board: board wrapper exposing set_prop
-- context:   request context; its initiator is recorded in the operation log
-- region_id: key into REGION_PROP
-- Raises PropertyValueNotInList for an unknown region and InternalError when
-- any step fails. Returns nothing on success.
function unit_manager:reset_npu_region(npu_board, context, region_id)
    local prop = REGION_PROP[region_id]
    if not prop then
        log:error('[unit_manager]reset npu device fail, doamin id(%s) invalid', region_id)
        error(base_messages.PropertyValueNotInList(region_id, 'DomainId'))
    end

    -- Record the outcome in the operation log under the region's display name
    local function record_operation(message)
        log:operation(context:get_initiator(), 'general_hardware', message, prop[REGION_NAME_IDNEX])
    end

    local ok, res = pcall(function()
        npu_board:set_prop(prop[LOCK_INDEX], RESET_UNLOCK)
        npu_board:set_prop(prop[RESET_INDEX], RESET)
    end)
    if not ok then
        log:error('[unit_manager]reset npu device(doamin:%s) fail, err is %s', region_id, res)
        self:post_reset(npu_board, prop)
        record_operation('Reset NPU Board %s Domain failed')
        error(base_messages.InternalError())
    end

    if not self:post_reset(npu_board, prop) then
        record_operation('Reset NPU Board %s Domain failed')
        error(base_messages.InternalError())
    end
    record_operation('Reset NPU Board %s Domain successfully')
end

-- Entry point for resetting an NPU board, resolved from the object's path.
-- Boards whose BusType equals NPU_REGION_TYPE are reset per region; all others
-- get a whole-board reset.
-- obj:       object whose path keys npu_board_map
-- context:   request context forwarded to the reset routine
-- region_id: region to reset; only used on the per-region path
-- Returns false when the board is unknown, otherwise whatever the chosen reset
-- routine returns.
function unit_manager:reset_npu_device(obj, context, region_id)
    local npu_board = self.npu_board_map[obj.path]
    if not npu_board then
        log:error('[unit_manager] reset npu device failed')
        return false
    end

    local per_region = npu_board:get_prop('BusType') == NPU_REGION_TYPE
    if not per_region then
        return self:reset_npu(obj, context, npu_board:get_prop('Slot'))
    end
    return self:reset_npu_region(npu_board, context, region_id)
end

-- Look up a unit handle in the unit collection by its position.
-- Returns the unit wrapper, or nil when nothing is registered at that position.
function unit_manager:get_board_obj_by_position(position)
    local collection = self.unit_collection
    return collection[position]
end

-- Query the power cap of one NPU module and store it in the board's PowerCapWatts.
-- bus:       bus handle passed to imu_cmd
-- module_id: module id; also used as the Slot to find the board
-- Does nothing when the query yields no power cap.
-- NOTE(review): the value is divided by 1000 here, while
-- set_npu_power_cap_reduce stores the result of the same query unscaled and
-- queries by Slot rather than main die id — confirm which is intended.
function unit_manager:npu_mpdule_power_cap(bus, module_id)
    -- Derive the module's main die id from the module id; the current hardware
    -- only supports querying the module power cap through the main die
    local main_die_id = module_id * 2 - 1
    local _, powercap = imu_cmd.get_npu_mpdule_power_cap(bus, main_die_id)
    if not powercap then
        return
    end

    local npuboard = self:get_npu_by_slot(module_id)
    if npuboard then
        -- Stored in W
        npuboard:set_prop('PowerCapWatts', powercap // 1000, 'bmc.kepler.Systems.Board.NpuBoard')
    else
        log:error('[unit_manager] update npu powercap faild, slot(%s) invalid', module_id)
    end
end

-- Apply a power cap reduction to all NPU boards and refresh their cap readings.
-- bus:              bus handle passed to imu_cmd
-- power_cap_reduce: value written to PowerCapReduce
-- PowerCapReduce is written on the first board only; every board is then
-- checked to read back the same value (presumably the setting is shared
-- between boards — confirm).
-- Returns false as soon as a board has another BusType, the write fails or a
-- read-back differs; true otherwise.
function unit_manager:set_npu_power_cap_reduce(bus, power_cap_reduce)
    local written = false
    for _, board in pairs(self.npu_boards) do
        if board:get_prop('BusType') ~= NPU_REGION_TYPE then
            log:error('current npuboard bustype can not set access npuid')
            return false
        end

        local write_ok = true
        if not written then
            write_ok = pcall(board.set_prop, board, 'PowerCapReduce', power_cap_reduce)
            written = write_ok
        end
        if not (write_ok and board:get_prop('PowerCapReduce') == power_cap_reduce) then
            log:error('set powercap reduce to cpld failed')
            return false
        end

        local board_id = board:get_prop('Slot')
        local _, power_cap, max_power_cap = imu_cmd.get_npu_mpdule_power_cap(bus, board_id)
        if power_cap then
            board:set_prop('PowerCapWatts', power_cap, 'bmc.kepler.Systems.Board.NpuBoard')
        end
        if max_power_cap then
            board:set_prop('MaxPowerCapWatts', max_power_cap, 'bmc.kepler.Systems.Board.NpuBoard')
        end
    end
    return true
end

-- Apply a set of NPU power cap settings.
-- obj, ctx: unused here
-- configs:  table of entries with NpuId and Power fields; NpuId equal to
--           ALL_NPU_BOARD_NUM applies Power as a reduction to every board
-- Returns an array with the NpuId of every entry that could not be applied.
-- Sleeps 0.5 s after each entry.
function unit_manager:set_npu_power_cap(obj, ctx, configs)
    local failed_list = {}
    for k, v in pairs(configs) do
        -- configs is a dictionary: the key is the npu id, the value holds the
        -- power offset for that id
        log:debug('configs k=%s, id=%s, power=%s', k, v.NpuId, v.Power)

        local applied
        if v.NpuId == ALL_NPU_BOARD_NUM then
            applied = self:set_npu_power_cap_reduce(self.bus, v.Power)
        else
            local status = imu_cmd.set_npu_cap_param(self.bus, v.NpuId, v.Power)
            applied = status == comp_code.Success
            if applied then
                self:npu_mpdule_power_cap(self.bus, v.NpuId)
            end
        end
        if not applied then
            failed_list[#failed_list + 1] = v.NpuId
        end
        cmn.skynet.sleep(50)
    end
    return failed_list
end

--- Unregister every logic firmware entry located at the given position.
-- @param position value compared against each entry's `position` field
function unit_manager:unregister_cpld_firmware_info(position)
    for _, fw in pairs(self.logic_fw) do
        local at_position = fw.position == position
        if at_position then
            c_logic_fw.unregister_cpld(fw, fw.id)
        end
    end
end

--- Describe the metric collection items of the NPU board registered under obj.path.
-- @param obj object whose `path` is the key into self.npu_board_map
-- @return component type string, classification entries, identification entries and
--         the metric name list; '', {}, {}, {} when no board is registered for the path
function unit_manager:get_npu_metric_collection_items(obj)
    local board = self.npu_board_map[obj.path]
    if not board then
        log:error('[unit_manager] get_npu_metric_collection_items failed')
        return '', {}, {}, {}
    end

    -- Entry carrying the property value as returned by get_prop.
    local function raw(name)
        return {PropName = name, PropVal = board:get_prop(name)}
    end
    -- Entry carrying the property value converted with tostring.
    local function text(name)
        return {PropName = name, PropVal = tostring(board:get_prop(name))}
    end

    local classification = {
        raw('Manufacturer'),
        raw('Model'),
        raw('ComputeCapability')
    }
    local identification = {
        text('SerialNumber'),
        text('BoardID'),
        raw('Name'),
        raw('BoardType'),
        text('NodeId')
    }

    return 'NpuBoard', classification, identification, NPUBOARD_METRIC_NAME
end

--- Append the board's PowerWatts reading, as a string, to the end of data.
-- @param data list that receives the reading
-- @param obj object exposing get_prop(name, interface)
local function get_npuboard_powerwatts(data, obj)
    local watts = obj:get_prop('PowerWatts', 'bmc.kepler.Systems.Board.NpuBoard')
    data[#data + 1] = tostring(watts)
end

-- Dispatch table: metric name -> { metric_func = collector(data, obj) }.
local npuboard_metric_func_table = {}
npuboard_metric_func_table[NPUBOARD_POWERWATTS] = { metric_func = get_npuboard_powerwatts }

--- Collect the samples of one metric for an NPU board.
-- @param obj board object handed to the metric's collector
-- @param metric_name key into npuboard_metric_func_table
-- @return list of samples; an empty table when the metric is unknown or when any
--         collected sample is one of the abnormal marker values
local function get_npuboard_metric_data(obj, metric_name)
    log:debug('metric_name = %s', metric_name)

    local handler = npuboard_metric_func_table[metric_name]
    if handler == nil then
        log:info('invalid metric, metric_name = %s', metric_name)
        return {}
    end

    local samples = {}
    handler.metric_func(samples, obj)

    -- Abnormal values are not collected: one such sample discards the whole result.
    local abnormal = { ['32768'] = true, ['16384'] = true, ['65535'] = true, ['255'] = true }
    for _, sample in pairs(samples) do
        if abnormal[sample] then
            return {}
        end
    end
    return samples
end

--- Gather metric data for the NPU board registered under obj.path.
-- @param obj object whose `path` is the key into self.npu_board_map
-- @param metric_name table of metric names to collect
-- @return list of { MetricName = ..., Data = ... } entries; metrics that yield no
--         data are left out, and the list is empty when no board is registered
function unit_manager:get_npu_metric_collection_data(obj, metric_name)
    local collected = {}
    local board = self.npu_board_map[obj.path]
    if not board then
        log:error('[unit_manager] get_npu_metric_collection_data failed')
        return collected
    end

    for _, name in pairs(metric_name) do
        local samples = get_npuboard_metric_data(board, name)
        if next(samples) ~= nil then
            collected[#collected + 1] = { MetricName = name, Data = samples }
        end
    end
    return collected
end

-- 串行CPLD状态检测任务
--- Endless task that runs the CPLD self test of every unit, one unit at a time.
-- Never returns: after an initial delay it loops over self.unit_collection forever.
-- A failing self test is logged and does not stop the loop.
function unit_manager:start_serial_cpld_status_task()
    log:notice('[UnitManager] start serial CPLD status update task')
    -- Initial delay before the first round (18000 ticks; 180 s at the tick scale
    -- implied by the interval notes below).
    cmn.skynet.sleep(18000)

    repeat
        -- Walk all unit objects and run their CPLD self tests serially.
        for position, unit_obj in pairs(self.unit_collection) do
            local testable = unit_obj and type(unit_obj.cpld_self_test) == 'function'
            if testable then
                local passed, reason = pcall(function()
                    unit_obj:cpld_self_test()
                end)
                if not passed then
                    log:info('[UnitManager] CPLD self test failed for position %s: %s', position, reason)
                end
                -- Serial execution: 100 ms gap between units.
                cmn.skynet.sleep(10)
            end
        end
        -- 60 s between rounds.
        cmn.skynet.sleep(6000)
    until false
end

-- 串行Riser卡MCU状态检测任务
--- Endless task that refreshes the MCU status of every riser card, one card at a time.
-- Never returns: after an initial delay it loops over self.riser_cards forever.
-- Cards whose 'MCUVersion' property is 'N/A' are skipped; a failing update is
-- logged and does not stop the loop.
function unit_manager:start_serial_riser_mcu_status_task()
    log:notice('[UnitManager] start serial Riser MCU status update task')
    -- Wait 2 minutes during start-up before the first MCU status update.
    cmn.skynet.sleep(12000)

    repeat
        -- Walk all riser card objects and update their MCU status serially.
        for _, riser in pairs(self.riser_cards) do
            local has_mcu = riser and riser:get_prop('MCUVersion') ~= 'N/A'
            if has_mcu then
                local updated, reason = pcall(function()
                    c_riser_card.update_mcu_status(riser)
                end)
                if not updated then
                    local device = riser:get_prop('DeviceName') or 'Unknown'
                    log:info('[UnitManager] Riser MCU status update failed for %s: %s',
                        device, reason)
                end
                -- Serial execution: 50 ms gap between riser cards.
                cmn.skynet.sleep(5)
            end
        end
        -- 5 s between rounds.
        cmn.skynet.sleep(500)
    until false
end


-- Module result: unit_manager passed through the singleton helper required from 'mc.singleton'.
return singleton(unit_manager)