-- Copyright (c) 2024 Huawei Technologies Co., Ltd.
-- openUBMC is licensed under Mulan PSL v2.
-- You can use this software according to the terms and conditions of the Mulan PSL v2.
-- You may obtain a copy of Mulan PSL v2 at:
--         http://license.coscl.org.cn/MulanPSL2
-- THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
-- EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-- MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-- See the Mulan PSL v2 for more details.

-- Description: PCIe group management
local mdb = require 'mc.mdb'
local class = require 'mc.class'
local singleton = require 'mc.singleton'
local log = require 'mc.logging'
local cmn = require 'common'
local c_pcie_device = require 'device.class.pcie_device'
local c_pcie_card = require 'device.class.pcie_card'
local c_ocp_card = require 'device.class.ocp_card'
local c_eth_device = require 'device.class.eth_device'
local c_pcie_card_flash_checker = require 'device.class.pcie_card_flash_checker'
local msg = require 'pcie_device.types.message'
local file_sec = require 'utils.file'
local utils = require 'mc.utils'
local log_helper = require 'infrastructure.log_helper'
local bs = require 'mc.bitstring'
local client = require 'pcie_device.client'
local skynet = require 'skynet'
local ipmi = require 'ipmi'
local comp_code = ipmi.types.Cc
local mc_signal = require 'mc.signal'
local event = require 'infrastructure.event'
local cjson = require 'cjson'
local common_interface = require 'common_interface'

-- Service class; its methods below keep per-class lists of PCIeDevice, PCIeCard,
-- OCPCard, EthDevice and PCIeCardFlashChecker wrapper objects.
local c_device_service = class()

-- Return codes handed back by the method_set_* entry points.
local RET_OK<const> = 0
local RET_ERR<const> = 1
local NULL_VAL<const> = 'N/A'
-- Values compared against event.fault_type when handling link-degradation events.
local BANDWIDTH_REDUCTION<const> = 0
local LINK_SPEED_REDUCED<const> = 1
local PACKAGE_PATH<const> = '/bmc/kepler/Managers/1/Package'
local SYS_CHAN_NUM<const> = 15
local UINT8_MAX<const> = 0xff
-- Event key passed to cmn.generate_event, and the EventCode used to recognise
-- the same event in the latest-alarm list.
local PCIE_CARD_INIT_ABNORMAL_EVENT_KEY_ID = 'PCIeCard.PCIeCardInitAbnormal'
local PCIE_CARD_INIT_ABNORMAL_EVENT_CODE = '0x080000A5'
-- Fourth message argument that marks an init-abnormal alarm as a flash fault.
local PCIE_CARD_FLASH_FAULT = "Flash fault"
-- Last argument passed to cmn.generate_event for PCIe card events.
local PCIE_CARD_FRU_TYPE<const> = 8
-- Flash-fault status values: 0 clears the alarm, 1 raises it.
local FLASH_FAULT_DEASSEART<const> = 0
local FLASH_FAULT_ASSEART<const> = 1

--- Wraps a newly added MDS object in its device class and stores it in the matching list.
-- Classes not listed here are ignored silently.
-- @param class_name MDS class name of the added object
-- @param mds_obj    the MDS object being added
-- @param position   position string identifying the object's group
function c_device_service:on_add_object(class_name, mds_obj, position)
    -- One builder per supported class; each constructs the wrapper and appends it.
    local builders = {
        PCIeDevice = function()
            local wrapper = c_pcie_device.new(mds_obj, position, self.bus)
            table.insert(self.pcie_device_list, wrapper)
        end,
        PCIeCard = function()
            local wrapper = c_pcie_card.new(mds_obj, position, self.bus, self.reset_local_db)
            table.insert(self.pcie_card_list, wrapper)
        end,
        OCPCard = function()
            local wrapper = c_ocp_card.new(mds_obj, position, self.bus, self.reset_local_db)
            table.insert(self.ocp_card_list, wrapper)
        end,
        EthDevice = function()
            local wrapper = c_eth_device.new(mds_obj, position, self.bus)
            table.insert(self.eth_device_list, wrapper)
        end,
        PCIeCardFlashChecker = function()
            local wrapper = c_pcie_card_flash_checker.new(mds_obj, position, self.bus)
            table.insert(self.flash_checker_list, wrapper)
        end
    }

    local build = builders[class_name]
    if not build then
        return
    end
    log:notice('[DeviceService] Add object, class: %s, position: %s', class_name, position)
    build()
    log:notice('[DeviceService] Add object successfully, class: %s, position: %s', class_name, position)
end

--- Removes the first element of `list` whose `position` field equals `position`.
-- Does nothing when no element matches.
local function remove_ele_by_position(list, position)
    for index, item in pairs(list) do
        if item.position == position then
            table.remove(list, index)
            -- Stop right away: the list was just modified under the iterator.
            return
        end
    end
end

--- Drops the wrapper stored for a deleted MDS object from the list of its class.
-- Classes not listed here are ignored silently.
-- @param class_name MDS class name of the deleted object
-- @param mds_obj    the deleted MDS object (unused)
-- @param position   position string identifying the object's group
function c_device_service:on_delete_object(class_name, mds_obj, position)
    -- Maps each supported class to the field of `self` holding its wrappers.
    local list_field_of = {
        PCIeDevice = 'pcie_device_list',
        PCIeCard = 'pcie_card_list',
        OCPCard = 'ocp_card_list',
        EthDevice = 'eth_device_list',
        PCIeCardFlashChecker = 'flash_checker_list'
    }

    local field = list_field_of[class_name]
    if not field then
        return
    end
    log:notice('[DeviceService] Delete object, class: %s, position: %s', class_name, position)
    remove_ele_by_position(self[field], position)
end

--- Collects the currently active "PCIe card init abnormal / Flash fault" alarms.
-- Queries the latest alarms through event.get_latest_alarm_list and keeps those whose
-- EventCode matches PCIE_CARD_INIT_ABNORMAL_EVENT_CODE and whose decoded MessageArgs
-- carry PCIE_CARD_FLASH_FAULT at index 4. Index 2 of MessageArgs is used as the slot.
-- @param bus bus handle forwarded to event.get_latest_alarm_list
-- @return table mapping slot -> alarm; empty when the query fails or nothing matches
local function get_latest_alarm_list(bus)
    local latest_alarm_list = {}
    local ok, alarm_list = pcall(event.get_latest_alarm_list, bus)
    if ok and alarm_list then
        log:debug('get_latest_alarm_list successfully.')
    else
        log:notice('get_latest_alarm_list failed, ok: %s, alarm_list: %s', ok , alarm_list)
        return latest_alarm_list
    end

    local alarm_msg_args, slot
    for _, alarm in pairs(alarm_list) do
        if not alarm.EventCode or alarm.EventCode ~= PCIE_CARD_INIT_ABNORMAL_EVENT_CODE then
            log:debug('alarm.EventCode(%s) not match', alarm.EventCode)
            goto continue
        end

        if not alarm.MessageArgs or alarm.MessageArgs == "" then
            log:debug('alarm.MessageArgs empty')
            goto continue
        end

        ok, alarm_msg_args = pcall(cjson.decode, alarm.MessageArgs)
        if not ok then
            log:debug('alarm.MessageArgs decode failed, err: %s', alarm_msg_args)
            goto continue
        end
        -- A valid JSON document may still be a scalar or null; indexing it would raise.
        if type(alarm_msg_args) ~= 'table' then
            log:debug('alarm.MessageArgs is not an array')
            goto continue
        end
        if not alarm_msg_args[4] or alarm_msg_args[4] ~= PCIE_CARD_FLASH_FAULT then
            goto continue
        end

        slot = alarm_msg_args[2]
        -- A nil key cannot be stored in a table; skip alarms that carry no slot.
        if slot == nil then
            log:debug('alarm.MessageArgs has no slot')
            goto continue
        end
        latest_alarm_list[slot] = alarm
        ::continue::
    end

    return latest_alarm_list
end

--- Raises or clears the flash-fault "init abnormal" event for one slot.
-- An event is generated only when the requested status differs from what the
-- latest-alarm list shows for that slot.
-- @param bus         bus handle used to query the latest alarms
-- @param status      FLASH_FAULT_ASSEART to raise, FLASH_FAULT_DEASSEART to clear
-- @param slot        slot id of the card; 0 and 255 are rejected
-- @param card_des    card description placed in the message arguments when raising
-- @param device_name component name used when raising
-- @return false when the slot is rejected or event generation fails, true otherwise
local function update_init_abnormal_event(bus, status, slot, card_des, device_name)
    -- Reject invalid slots before querying, so no alarm query is issued for them.
    if slot == 0 or slot == 255 then
        return false
    end

    local ok, resp, msg_args
    local alarm_list = get_latest_alarm_list(bus)

    if status == FLASH_FAULT_DEASSEART and alarm_list[slot] then
        -- Clear the alarm, reusing the arguments and component name it was raised with.
        msg_args = cjson.decode(alarm_list[slot].MessageArgs)
        ok, resp = pcall(cmn.generate_event, PCIE_CARD_INIT_ABNORMAL_EVENT_KEY_ID, msg_args, 'false',
            alarm_list[slot].ComponentName, PCIE_CARD_FRU_TYPE)
        if not ok then
            log:debug('deassert event failed, error: %s', resp)
            return false
        else
            log:notice('deassert event successfully, args: %s', cjson.encode(msg_args))
            return true
        end
    end

    if status == FLASH_FAULT_ASSEART and not alarm_list[slot] then
        -- Raise the alarm.
        msg_args = {'', slot, card_des, PCIE_CARD_FLASH_FAULT}
        ok, resp = pcall(cmn.generate_event, PCIE_CARD_INIT_ABNORMAL_EVENT_KEY_ID, msg_args, 'true',
            device_name, PCIE_CARD_FRU_TYPE)
        if not ok then
            log:debug('assert event failed, error: %s', resp)
            return false
        else
            log:notice('assert event successfully, args: %s', cjson.encode(msg_args))
            return true
        end
    end
    -- The alarm state already matches the requested status.
    return true
end

--- Evaluates the flash-fault state of one card and raises or clears the alarm.
-- With no checker at `position`, the alarm of the card's slot is cleared (when the card
-- still needs checking). With a checker, the card's "vvvv_dddd" VendorID/DeviceID key is
-- looked up in the checker's id_list: a hit raises the alarm, a miss clears it.
-- @param card_obj           PCIeCard/OCPCard wrapper
-- @param position           position shared by the card and its checker
-- @param flash_checker_list list of flash-checker wrappers
-- @param bus                bus handle forwarded to the event helper
local function update_flash_checker(card_obj, position, flash_checker_list, bus)
    local checker
    for _, candidate in pairs(flash_checker_list) do
        if candidate.position == position then
            checker = candidate
            break
        end
    end

    local slot = card_obj:get_prop('SlotID')
    if not checker then
        if card_obj:need_check() then
            update_init_abnormal_event(bus, FLASH_FAULT_DEASSEART, slot, '', '')
            card_obj:add_check_count()
        end
        return
    end
    if not checker.id_list or not checker:need_check() then
        return
    end

    local vendor_id = card_obj:get_prop('VendorID')
    local device_id = card_obj:get_prop('DeviceID')
    local device_name = card_obj:get_prop('DeviceName')
    local card_des = checker.id_list[string.format('%04x_%04x', vendor_id, device_id)]

    local status
    if card_des then
        card_obj:update_description(card_des)
        status = FLASH_FAULT_ASSEART
    else
        status = FLASH_FAULT_DEASSEART
    end
    checker:update_fault_status(status)
    -- Count the check only when the event helper reported success.
    if update_init_abnormal_event(bus, status, slot, card_des, device_name) then
        checker:add_check_count()
    end
end

--- Starts the periodic sync task for the card found at `position` in `list`.
-- Returns without starting anything when the list has no card at that position.
-- The task updates VPD and VID/DID info once, then repeats every 60 * 100 sleep ticks:
-- read link info, sync properties to the card, evaluate the flash-fault alarm.
local function pcie_and_ocp_task(device_obj, list, position, pcie_addr_info_obj, flash_checker_list, bus)
    local card_obj = cmn.find(list, function(candidate)
        if candidate.position == position then
            return true
        end
    end)
    if not card_obj then
        return
    end

    cmn.skynet.fork_loop({count = 0}, function ()
        card_obj:update_pcie_vpd_info(device_obj)
        card_obj:update_vid_did(pcie_addr_info_obj)

        local function refresh()
            -- 1. Fetch LinkWidth and LinkWidthAbility
            card_obj:get_pcie_lang_info(device_obj)
            -- 2. Sync properties to the card
            card_obj:sync_info_to_card(device_obj)
            -- 3. Raise or clear the flash init alarm
            update_flash_checker(card_obj, position, flash_checker_list, bus)
        end

        while true do
            local ok, err = pcall(refresh)
            if not ok then
                log:debug('[pcie_device] Error=%s', err)
            end
            cmn.skynet.sleep(60 * 100)
        end
    end)
end

--- Repeatedly tries to clear the flash-fault alarm of `slot` until the latest-alarm
-- list no longer contains it, giving up after 120 attempts (about 120 seconds).
function c_device_service:clear_flash_checker_event(slot)
    local max_attempts = 120 -- wait at most 120 seconds
    for _ = 1, max_attempts do
        update_init_abnormal_event(self.bus, FLASH_FAULT_DEASSEART, slot, '', '')
        cmn.skynet.sleep(100) -- wait 1 second, then re-read the current alarms
        local current = get_latest_alarm_list(self.bus)
        if current and not current[slot] then
            return
        end
    end
end

-- Subscribes to the PCIeDevice's dev_prop_changed signal and re-emits every
-- notification on this service's own dev_prop_changed signal.
function c_device_service:register_dev_prop_changed(pcie_device_obj, position)
    local function forward(object_name, prop, value)
        self.dev_prop_changed:emit(object_name, prop, value)
    end
    pcie_device_obj.dev_prop_changed:on(forward)
end

-- Registers the follow-up work that runs once PcieAddrInfo has been synced to the PCIeDevice.
function c_device_service:register_get_pcie_info_task(pcie_device_obj, position)
    local function on_sync_complete(device_obj, pcie_addr_info_obj)
        -- Each position holds exactly one PCIeCard or OCPCard, so both lists are tried.
        pcie_and_ocp_task(device_obj, self.pcie_card_list, position, pcie_addr_info_obj,
            self.flash_checker_list, self.bus)
        pcie_and_ocp_task(device_obj, self.ocp_card_list, position, pcie_addr_info_obj,
            self.flash_checker_list, self.bus)
        -- With the device BDF synced, fetch the ClassCode.
        device_obj:query_from_pmu_task()
        -- Start syncing the topology BDF to the EthDevices at this position.
        local matched = {}
        for _, eth_obj in pairs(self.eth_device_list) do
            if eth_obj.position == position then
                matched[#matched + 1] = eth_obj
            end
        end
        if #matched > 0 then
            self:sync_eth_device_bdf_info(matched, device_obj.ref_pcie_addr_info)
        end
    end
    pcie_device_obj.sync_pai_to_device_complete:on(on_sync_complete)
end

--- Called once all objects of a position have been added. In a forked coroutine it
-- wires up the PCIeDevice at that position (if any) and starts its address-info sync.
function c_device_service:on_add_object_complete(position)
    cmn.skynet.fork(function()
        -- Each position holds exactly one PCIeDevice.
        local device = cmn.find(self.pcie_device_list, function(candidate)
            if candidate.position == position then
                return true
            end
        end)
        if not device then
            return
        end

        self:register_dev_prop_changed(device)
        -- 1. Register the work to run after PcieAddrInfo has been synced to the PCIeDevice
        self:register_get_pcie_info_task(device, position)
        -- 2. Start syncing the property values
        self:sync_pcie_addr_info_to_device(device)
    end)
end

--- Reports whether any managed PCIe card identifies itself as a RAID card.
-- @return true when at least one card's is_raid() is truthy, false otherwise
function c_device_service:contain_raid()
    local found = false
    for _, card in pairs(self.pcie_card_list) do
        if card:is_raid() then
            found = true
            break
        end
    end
    return found
end

-- Writes the slot specification version of the downstream connector into the
-- PCIeType property of the PCIeDevice object.
function c_device_service:update_pcie_type(biz_topo_service, obj)
    obj:set_prop('PCIeType', biz_topo_service:method_get_pcie_type_by_pos(obj.position))
end

-- Waits until every EthDevice in obj_list reports sync_ready, polling once per
-- second for at most 120 attempts. Returns true when all are ready, false on timeout.
local function retry_wait_sync_ready(obj_list)
    for _ = 1, 120 do
        local all_ready = true
        for _, eth_obj in pairs(obj_list) do
            all_ready = all_ready and eth_obj.sync_ready
        end
        if all_ready then
            return true
        end
        cmn.skynet.sleep(100)
    end
    return false
end

-- obj_list: all EthDevice objects that belong to one network card
-- pfid_map: pfid_map[bcu_idx][die_id] gives the starting PfId of that die
-- PfId allocation rule: ports of a die are numbered upwards from the die's starting PfId.
-- Steps:
-- 1. Group the EthDevice objects by (bcu_idx, die_id)
-- 2. Sort each group by PortId in ascending order
-- 3. Assign Device = 0, Function = 0, 1, 2, ... and PortFunctionId = StartPfId, StartPfId + 1, ...
-- Objects whose die has no entry in pfid_map are left untouched.
local function set_pfid_bdf_foreach(obj_list, pfid_map)
    local port_list = {} -- die key -> EthDevice objects on that die
    local idx_list = {}  -- die keys, one entry per die, in first-seen order
    for _, eth_obj in pairs(obj_list) do
        -- bcu_idx and die_id combined form the key of a die
        local idx = (eth_obj.bcu_idx << 16) + eth_obj.die_id
        local group = port_list[idx]
        if not group then
            group = {}
            port_list[idx] = group
            -- Record each die once, so it is sorted, assigned and logged a single time.
            idx_list[#idx_list + 1] = idx
        end
        group[#group + 1] = eth_obj
    end

    for _, idx in ipairs(idx_list) do
        local objs_per_die = port_list[idx]
        table.sort(objs_per_die, function(a, b)
            return a.mds_obj.PortId < b.mds_obj.PortId
        end)
        for i, obj in ipairs(objs_per_die) do
            -- Tolerate a BCU that is absent from pfid_map instead of indexing nil.
            local die_map = pfid_map[obj.bcu_idx]
            local start_pfid = die_map and die_map[obj.die_id]
            if start_pfid then
                obj.mds_obj.Device = 0
                obj.mds_obj.Function = i - 1
                obj.mds_obj.PortFunctionId = i + start_pfid - 1
                log:notice("[DeviceService] Set EthDevice %s, Function %s, PortFunctionId %s",
                    obj.mds_obj.name, obj.mds_obj.Function, obj.mds_obj.PortFunctionId)
            end
        end
    end
end

-- obj_list: all EthDevice objects that belong to one network card
-- pfid_map: mapping from each DieId to its starting PfId
-- Forks a coroutine that waits for every EthDevice to finish syncing and then
-- assigns Function / PortFunctionId; logs an error when the wait times out.
local function set_bdf_pfid(pfid_map, obj_list)
    cmn.skynet.fork(function()
        if retry_wait_sync_ready(obj_list) then
            set_pfid_bdf_foreach(obj_list, pfid_map)
            return
        end
        log:error("[DeviceService] wait_sync_ready fail")
    end)
end

--- Syncs BDF info from the topology connector of `addr_obj` to every EthDevice in
-- `obj_list`, then schedules the Function / PortFunctionId assignment.
-- Returns early when the list is empty, `addr_obj` is missing, the biz_topo service is
-- unavailable, or no target connector is found.
-- @param obj_list all EthDevice objects of one network card
-- @param addr_obj PCIeAddrInfo object referenced by the card's PCIeDevice
function c_device_service:sync_eth_device_bdf_info(obj_list, addr_obj)
    if not next(obj_list) then
        return
    end
    if not addr_obj then
        return
    end

    local topo = self.service_manager:get_service('biz_topo')
    if not topo then
        log:error('[DeviceService] Get sub service biz_topo failed.')
        return
    end

    local target_conn = topo:method_get_target_conn_by_addr_obj(addr_obj)
    if not target_conn then
        log:error('[DeviceService] find target_c by %s failed.', addr_obj.mds_obj.name)
        return
    end

    for _, eth_obj in pairs(obj_list) do
        eth_obj:sync_bdf_info(target_conn)
    end
    set_bdf_pfid(topo.biz_topo.pfid_map, obj_list)
end

-- Associates a PCIeAddrInfo object with the PCIeDevice and syncs slot number and BDF info to it.
-- Two coroutines are forked, each polling once per second for at most 120 attempts:
--   * lookup by position, for devices loaded through BIOS reporting;
--   * lookup by DeviceType + SlotID, for devices with a static SlotID.
function c_device_service:sync_pcie_addr_info_to_device(obj)
    local topo = self.service_manager:get_service('biz_topo')
    if not topo then
        log:error('[DeviceService] Get sub service biz_topo failed.')
        return
    end

    local max_attempts = 120

    cmn.skynet.fork(function()
        -- PCIe devices loaded via BIOS reporting (SlotID needs updating): the PCIeAddrInfo
        -- object sits one level above the PCIeDevice.
        for _ = 1, max_attempts do
            -- Skip when SlotID or the device name has already been set.
            if obj:get_prop('SlotID') ~= 0 or obj.device_name_ready then
                log:notice('[DeviceService] device_name_ready, position=%s', obj.position)
                return
            end
            local addr_info = topo:method_get_pcie_addr_by_pos(obj.position)
            if addr_info then
                obj:sync_pcie_addr_info(addr_info)
                self:update_pcie_type(topo, obj)
                return
            end
            cmn.skynet.sleep(100)
        end
    end)

    cmn.skynet.fork(function()
        -- Other PCIe devices have a static SlotID and are matched by DeviceType and SlotID.
        for _ = 1, max_attempts do
            -- Skip when unconfigured or already set.
            if obj:get_prop('DeviceType') == 0 or obj.device_name_ready then
                return
            end
            local candidates = topo.biz_topo:get_objs('PCIeAddrInfo')
            for _, addr_obj in pairs(candidates) do
                if addr_obj:get_prop('ComponentType') == obj:get_prop('DeviceType') and
                    addr_obj:get_prop('SlotID') == obj:get_prop('SlotID') and addr_obj:get_prop('SlotID') ~= 0 then
                    obj:sync_pcie_addr_info(addr_obj)
                    self:update_pcie_type(topo, obj)
                    return
                end
            end
            cmn.skynet.sleep(100)
        end

        log:error(
            '[DeviceService] Can not find matching PCIeAddrinfo, PCIeDevice position=%s DeviceType=%s SlotID=%s',
            obj.position, obj:get_prop('DeviceType'), obj:get_prop('SlotID'))
    end)
end

-- Fetches from the resource tree the pcie_card objects managed by other components.
-- Returns the object list on success and nothing when the lookup raises.
function c_device_service:get_other_pcie_cards()
    -- general_hardware: DPUCard
    local dpu_path = '/bmc/kepler/Systems/1/PCIeDevices/PCIeCards/DPUCards'
    local card_intf = 'bmc.kepler.Systems.PCIeDevices.PCIeCard'
    local ok, obj_list = pcall(mdb.get_sub_objects, self.bus, dpu_path, card_intf)
    if not ok then
        return
    end
    return obj_list
end

-- Finds the pcie_device object at the same position and returns its Bus, Device, Function.
-- Returns 0xff, 0xff, 0xff when no device exists at that position.
function c_device_service:find_bdf(position)
    for _, dev in pairs(self.pcie_device_list) do
        if dev.position == position then
            return dev:get_prop('Bus'), dev:get_prop('Device'), dev:get_prop('Function')
        end
    end
    local invalid = 0xff -- invalid value
    return invalid, invalid, invalid
end

--- Looks up the PhysicalId of the CPU object whose LogicalId equals `logical_id`.
-- @return the PhysicalId of the matching CPU object, or 1 when none matches
local function get_physicalid_by_logicalid(logical_id)
    local result = 1
    client:ForeachCPUObjects(function(cpu)
        if cpu.LogicalId ~= logical_id then
            return
        end
        result = cpu.PhysicalId
    end)
    return result
end

--- Converts a lane-owner id into the resource name shown in the card dump.
-- Four reserved ids map to fixed names; any other id is treated as a logical CPU id
-- and rendered as 'CPU' followed by its physical id.
local function get_associated_resource(cpu_id)
    local reserved = {
        [0xff] = 'PCIeSwitch',
        [0xfe] = 'PCH',
        [0xfd] = 'CPU1,CPU2',
        [252] = 'CPU1,CPU2,CPU3,CPU4'
    }
    local name = reserved[cpu_id]
    if name then
        return name
    end
    return 'CPU' .. (get_physicalid_by_logicalid(cpu_id))
end

--- Dumps a table of PCIe and OCP card information to `<path>/card_info`.
-- Nothing is written when there are no PCIe cards, no OCP cards and no cards from
-- other components. The PCIe section lists this service's cards followed by those
-- returned by get_other_pcie_cards(); the OCP section lists the OCP cards.
-- @param ctx  call context (unused)
-- @param path directory in which the card_info file is created
function c_device_service:method_card_info_dump(ctx, path)
    local others = self:get_other_pcie_cards() or {}
    if not next(others) and not next(self.pcie_card_list) and not next(self.ocp_card_list)then
        return
    end

    log:notice('[DeviceService] App log dump.')
    local file = file_sec.open_s(path .. '/card_info', 'w+')
    if not file then
        log:error('[DeviceService] method_card_info_dump: open dir fail')
        return
    end

    -- Header and row layout shared by both sections. They are kept local to this
    -- function so the OCP section no longer assigns to global variables.
    local header = string.format('%-4s | %-10s | %-10s | %-15s | %-15s | %-12s | %-14s | %-16s | %-40s |' ..
        ' %-10s | %-14s | %-14s | %-8s | %-40s | %-40s | %-40s | %-40s\n', 'Slot', 'Vendor Id', 'Device Id',
        'Sub Vendor Id', 'Sub Device Id', 'Bus Number', 'Device Number', 'Function Number', 'Card Desc',
        'Board Id', 'PCB Version', 'CPLD Version', 'PartNum', 'Manufacturer',
        'Position', 'AssociatedResource', 'ProductName')
    local row_format = '%-4u | 0x%04x     | 0x%04x     | 0x%04x          | 0x%04x          | 0x%02x' ..
        '         | 0x%02x           | 0x%02x             | %-40s | %-10s | %-14s | %-14s | %-8s | %-40s ' ..
        '| %-40s | %-40s | %-40s\n'

    -- Formats one card wrapper managed by this service (properties read via get_prop).
    local function format_managed_card(card)
        local b, d, f = self:find_bdf(card.position)
        return string.format(row_format, card:get_prop('SlotID'), card:get_prop('VendorID'),
            card:get_prop('DeviceID'), card:get_prop('SubVendorID'), card:get_prop('SubDeviceID'), b, d, f,
            card:get_prop('Description'), card:get_prop('BoardID'), card:get_prop('PcbVersion'),
            'N/A', card:get_prop('PartNumber'), card:get_prop('Manufacturer'), card:get_prop('Position'),
            get_associated_resource(card:get_prop('LaneOwner')), card:get_prop('Name'))
    end

    -- Formats one card object owned by another component (properties read as fields).
    local function format_other_card(card)
        -- The trailing digits of the object path are used as the position; string.match
        -- yields nil when there are none, and find_bdf then returns the invalid BDF.
        local position = string.match(card.path, '[0-9]+$')
        local b, d, f = self:find_bdf(position)
        return string.format(row_format, card.SlotID, card.VendorID, card.DeviceID,
            card.SubVendorID, card.SubDeviceID, b, d, f,
            card.Description, card.BoardID, card.PcbVersion,
            'N/A', card.PartNumber, card.Manufacturer, card.Position,
            get_associated_resource(card.LaneOwner), card.Name)
    end

    utils.safe_close_file(file, function()
        if next(others) or next(self.pcie_card_list) then
            file:write('Pcie Card Info\n')
            -- Collect rows in a table and join once, avoiding repeated concatenation.
            local rows = {header}
            for _, v in pairs(self.pcie_card_list) do
                rows[#rows + 1] = format_managed_card(v)
                skynet.sleep(10)
            end

            for _, v in pairs(others) do
                rows[#rows + 1] = format_other_card(v)
                skynet.sleep(10)
            end
            file:write(table.concat(rows))
        end

        if next(self.ocp_card_list) then
            file:write('OCP Card Info\n')
            local rows = {header}
            for _, v in pairs(self.ocp_card_list) do
                rows[#rows + 1] = format_managed_card(v)
                skynet.sleep(10)
            end
            file:write(table.concat(rows))
        end
    end)
end

-- Looks up a PCIe device by Segment, SocketID, Bus, Device and Function.
-- For multi-host (host_id > 1) the device must also have the host's bit set in
-- MultihostPresence. Returns whatever cmn.find yields for the first match.
function c_device_service:get_pcie_device_by_ssbdf(segment, socket_id, bus, device, func, host_id)
    log:debug('get_pcie_device_by_ssbdf %s-%s-%s-%s-%s-%s', segment, socket_id, bus, device, func, host_id)
    local check_presence = host_id and host_id > 1

    local found = cmn.find(self.pcie_device_list, function(item)
        local same_address = item.mds.Segment == segment and item.mds.SocketID == socket_id and
            item.mds.Bus == bus and item.mds.Device == device and item.mds.Function == func
        if not same_address then
            return
        end
        -- Multi-host: bit (host_id - 1) of MultihostPresence must be set.
        if check_presence and not (item.mds.MultihostPresence & (1 << (host_id - 1)) > 0) then
            return
        end
        return true
    end)
    return found
end

-- Bitstring layout of one status byte: eight 1-bit fields named sys1..sys8, one per
-- system. set_pcie_status_by_system uses it to unpack and re-pack the byte.
local RAS_STATUS_BITS_DEF = bs.new([[<<
    sys1:1/little,
    sys2:1/little,
    sys3:1/little,
    sys4:1/little,
    sys5:1/little,
    sys6:1/little,
    sys7:1/little,
    sys8:1/little
>>]])

-- RAS PCIe alarm values are split into 8 bits, one per system (at most 8 systems).
-- Returns src_val with the bit of `system_id` replaced by `set_val`; on an invalid
-- system_id (outside 1..8) or set_val (outside 0..1) src_val is returned unchanged.
local function set_pcie_status_by_system(src_val, system_id, set_val)
    if system_id < 1 or system_id > 8 then
        log:error('invalid system_id = %s', system_id)
        return src_val
    end
    if set_val < 0 or set_val > 1 then
        log:error('invalid set_val = %s', set_val)
        return src_val
    end

    -- Unpack the byte into its named bit fields, patch one field, pack it back.
    local fields = RAS_STATUS_BITS_DEF:unpack(string.pack('B', src_val))
    local field_name = 'sys' .. system_id
    fields[field_name] = set_val
    local packed = RAS_STATUS_BITS_DEF:pack(fields)
    return packed:byte()
end

--- Sets the calling system's DiagnosticFault bit for the device at the given SSBDF.
-- The new value is saved in the PCIeDevice DB record and mirrored to the MDS object.
-- @param ctx   call context; ctx.SystemId selects the system bit (defaults to 1)
-- @param value bit value to set (0 or 1)
-- @return RET_OK on success, RET_ERR when the device or its DB record is not found
function c_device_service:method_set_diagnostic_fault(ctx, segment, socket_id, bus, device, func, value)
    local host_id = tonumber(ctx.SystemId) or 1
    local obj = self:get_pcie_device_by_ssbdf(segment, socket_id, bus, device, func, host_id)
    if not obj then
        return RET_ERR
    end
    local diagnosticfault_record = self.db.PCIeDevice({GroupPosition = obj.mds.GroupPosition})
    -- Same guard as the parity/system-error setters: a missing record is an error.
    if not diagnosticfault_record then
        log:error('get PCIeDevice record failed, GroupPosition: %s', obj.mds.GroupPosition)
        return RET_ERR
    end
    local multi_value = set_pcie_status_by_system(diagnosticfault_record.DiagnosticFault,
        host_id, value)
    diagnosticfault_record.DiagnosticFault = multi_value
    diagnosticfault_record:save()
    obj.mds.DiagnosticFault = multi_value
    return RET_OK
end

--- Sets the calling system's PredictiveFault bit for the device at the given SSBDF.
-- The new value is saved in the PCIeDevice DB record and mirrored to the MDS object.
-- @param ctx   call context; ctx.SystemId selects the system bit (defaults to 1)
-- @param value bit value to set (0 or 1)
-- @return RET_OK on success, RET_ERR when the device or its DB record is not found
function c_device_service:method_set_predictive_fault(ctx, segment, socket_id, bus, device, func, value)
    local host_id = tonumber(ctx.SystemId) or 1
    local obj = self:get_pcie_device_by_ssbdf(segment, socket_id, bus, device, func, host_id)
    if not obj then
        return RET_ERR
    end
    local predictivefault_record = self.db.PCIeDevice({GroupPosition = obj.mds.GroupPosition})
    -- Same guard as the parity/system-error setters: a missing record is an error.
    if not predictivefault_record then
        log:error('get PCIeDevice record failed, GroupPosition: %s', obj.mds.GroupPosition)
        return RET_ERR
    end
    local multi_value = set_pcie_status_by_system(predictivefault_record.PredictiveFault,
        host_id, value)
    predictivefault_record.PredictiveFault = multi_value
    predictivefault_record:save()
    obj.mds.PredictiveFault = multi_value
    return RET_OK
end

--- Sets the CorrectableError bit of `host_id` in the DB record and on the MDS object.
-- @param handle  service instance providing `db`
-- @param obj     PCIeDevice wrapper whose mds.GroupPosition selects the record
-- @param value   bit value to set (0 or 1)
-- @param host_id system whose bit is updated
-- @return RET_OK on success, RET_ERR when the DB record is not found
local function set_pcie_correctable_error(handle, obj, value, host_id)
    local ce_record = handle.db.PCIeDevice({GroupPosition = obj.mds.GroupPosition})
    -- Same guard as the parity/system-error setters: a missing record is an error.
    if not ce_record then
        return RET_ERR
    end
    local multi_value = set_pcie_status_by_system(ce_record.CorrectableError,
        host_id, value)
    ce_record.CorrectableError = multi_value
    ce_record:save()
    obj.mds.CorrectableError = multi_value
    return RET_OK
end

--- Sets the UncorrectableError bit of `host_id` in the DB record and on the MDS object.
-- @param handle  service instance providing `db`
-- @param obj     PCIeDevice wrapper whose mds.GroupPosition selects the record
-- @param value   bit value to set (0 or 1)
-- @param host_id system whose bit is updated
-- @return RET_OK on success, RET_ERR when the DB record is not found
local function set_pcie_uncorrectable_error(handle, obj, value, host_id)
    local uce_record = handle.db.PCIeDevice({GroupPosition = obj.mds.GroupPosition})
    -- Same guard as the parity/system-error setters: a missing record is an error.
    if not uce_record then
        return RET_ERR
    end
    local multi_value = set_pcie_status_by_system(uce_record.UncorrectableError,
        host_id, value)
    uce_record.UncorrectableError = multi_value
    uce_record:save()
    obj.mds.UncorrectableError = multi_value
    return RET_OK
end

--- Sets the FatalError bit of `host_id` in the DB record and on the MDS object.
-- @param handle  service instance providing `db`
-- @param obj     PCIeDevice wrapper whose mds.GroupPosition selects the record
-- @param value   bit value to set (0 or 1)
-- @param host_id system whose bit is updated
-- @return RET_OK on success, RET_ERR when the DB record is not found
local function set_pcie_fatal_error(handle, obj, value, host_id)
    local fe_record = handle.db.PCIeDevice({GroupPosition = obj.mds.GroupPosition})
    -- Same guard as the parity/system-error setters: a missing record is an error.
    if not fe_record then
        return RET_ERR
    end
    local multi_value = set_pcie_status_by_system(fe_record.FatalError,
        host_id, value)
    fe_record.FatalError = multi_value
    fe_record:save()
    obj.mds.FatalError = multi_value
    return RET_OK
end

--- Sets the ParityError bit of `host_id` in the DB record and on the MDS object.
-- @return RET_OK on success, RET_ERR when the DB record is not found
local function set_pcie_parity_error(handle, obj, value, host_id)
    local record = handle.db.PCIeDevice({GroupPosition = obj.mds.GroupPosition})
    if record then
        local updated = set_pcie_status_by_system(record.ParityError, host_id, value)
        record.ParityError = updated
        record:save()
        obj.mds.ParityError = updated
        return RET_OK
    end
    return RET_ERR
end

--- Sets the SystemError bit of `host_id` in the DB record and on the MDS object.
-- @return RET_OK on success, RET_ERR when the DB record is not found
local function set_pcie_system_error(handle, obj, value, host_id)
    local record = handle.db.PCIeDevice({GroupPosition = obj.mds.GroupPosition})
    if record then
        local updated = set_pcie_status_by_system(record.SystemError, host_id, value)
        record.SystemError = updated
        record:save()
        obj.mds.SystemError = updated
        return RET_OK
    end
    return RET_ERR
end

--- Sets one of the PCIe error status bits for the device at the given SSBDF.
-- @param ctx        call context; ctx.SystemId selects the system bit (defaults to 1)
-- @param err_status one of 'CorrectableError', 'UncorrectableError', 'FatalError',
--                   'ParityError', 'SystemError'
-- @param value      bit value to set
-- @return RET_ERR when the device is not found, the setter's result for a known
--         err_status, and nothing for an unknown err_status
function c_device_service:method_set_error_status(ctx, segment, socket_id, bus, device, func, err_status, value)
    local host_id = tonumber(ctx.SystemId) or 1
    local obj = self:get_pcie_device_by_ssbdf(segment, socket_id, bus, device, func, host_id)
    if not obj then
        log:error("get pcie device by ssbdf failed.", value)
        return RET_ERR
    end

    -- err_status -> { log format, setter }
    local dispatch = {
        CorrectableError = {"Set PCIe CorrectableError == %s.", set_pcie_correctable_error},
        UncorrectableError = {"Set PCIe UncorrectableError == %s.", set_pcie_uncorrectable_error},
        FatalError = {"Set PCIe FatalError == %s.", set_pcie_fatal_error},
        ParityError = {"Set PCIe ParityError == %s.", set_pcie_parity_error},
        SystemError = {"Set PCIe SystemError == %s.", set_pcie_system_error}
    }
    local entry = dispatch[err_status]
    if not entry then
        return
    end
    log:info(entry[1], value)
    return entry[2](self, obj, value, host_id)
end

-- Looks up a PCIe device by Segment, Bus, Device and Function, ignoring the socket.
-- For multi-host (host_id > 1) the device must also have the host's bit set in
-- MultihostPresence. Returns whatever cmn.find yields for the first match.
function c_device_service:get_pcie_device_by_sbdf(segment, bus, device, func, host_id)
    local check_presence = host_id and host_id > 1

    local found = cmn.find(self.pcie_device_list, function(item)
        local same_address = item.mds.Segment == segment and item.mds.Bus == bus and
            item.mds.Device == device and item.mds.Function == func
        if not same_address then
            return
        end
        -- Multi-host: bit (host_id - 1) of MultihostPresence must be set.
        if check_presence and not (item.mds.MultihostPresence & (1 << (host_id - 1)) > 0) then
            return
        end
        return true
    end)
    return found
end

-- Record the UCEByBIOS status reported for the device at segment/bus/device/function.
-- The new value is produced by set_pcie_status_by_system from the stored value, the host id
-- and `value` (presumably a per-host merge -- confirm against that helper), then written both
-- to the PCIeDevice database record and to the live object.
-- @param ctx request context; ctx.SystemId selects the host (falls back to 1)
-- @return RET_OK on success, RET_ERR when no matching device exists
function c_device_service:method_set_uce_by_bios(ctx, segment, bus, device, func, value)
    local host_id = tonumber(ctx.SystemId) or 1
    local target = self:get_pcie_device_by_sbdf(segment, bus, device, func, host_id)
    if target then
        local record = self.db.PCIeDevice({GroupPosition = target.mds.GroupPosition})
        local merged = set_pcie_status_by_system(record.UCEByBIOS, host_id, value)
        -- Persist first, then mirror the same value onto the in-memory object.
        record.UCEByBIOS = merged
        record:save()
        target.mds.UCEByBIOS = merged
        return RET_OK
    end
    return RET_ERR
end

-- Apply one queued link-degradation event to the PCIe device it addresses.
-- @param obj   the device service instance (provides get_pcie_device_by_sbdf and db)
-- @param event queued event: segment, bus, device, func, host_id, value, fault_type
-- @return true once the device was found (even if fault_type turned out to be invalid);
--         nothing when the device does not exist yet, after pausing so the caller can retry
local function set_pcie_link_reduced(obj, event)
    local target = obj:get_pcie_device_by_sbdf(event.segment, event.bus, event.device, event.func,
        event.host_id)
    if not target then
        -- Each miss waits once (1s according to the original note); the caller's countdown
        -- bounds the total retry time.
        cmn.skynet.sleep(100)
        return
    end

    -- Pick the property that corresponds to the reported fault type.
    local prop
    if event.fault_type == BANDWIDTH_REDUCTION then
        prop = 'BandwidthReduction'
    elseif event.fault_type == LINK_SPEED_REDUCED then
        prop = 'LinkSpeedReduced'
    end
    if prop == nil then
        log:error('Invalid fault type of event')
        return true
    end

    -- Persist the alarm state, then mirror it onto the live object.
    local record = obj.db.PCIeDevice({
        GroupPosition = target.mds.GroupPosition
    })
    local merged = set_pcie_status_by_system(record[prop], event.host_id, event.value)
    record[prop] = merged
    record:save()
    target.mds[prop] = merged
    return true
end

-- Advance the retry bookkeeping after one pass over the head of the event queue.
-- Every queued event loses one countdown tick. The head entry is then always removed; it is
-- re-appended at the tail first when it was neither handled nor expired.
-- @param obj            holder of reduction_event_list
-- @param cur_event_obj  the event that was just processed (the current head)
-- @param pcie_obj_found truthy when the event's device was found and handled
local function update_reduction_event_count(obj, cur_event_obj, pcie_obj_found)
    local queue = obj.reduction_event_list
    for _, pending in ipairs(queue) do
        pending.countdown = pending.countdown - 1
    end

    local retry_later = not pcie_obj_found and cur_event_obj.countdown > 0
    if retry_later then
        queue[#queue + 1] = cur_event_obj -- goes to the back of the queue
    end
    table.remove(queue, 1) -- head is either done, timed out, or has been moved to the back
end

-- Consumer loop for reported link bandwidth-reduction / speed-reduction events.
-- Never returns: it blocks in cmn.skynet.wait() while the queue is empty and otherwise
-- processes the head of obj.reduction_event_list.
local function reduction_event_consumer(obj)
    while true do
        if #obj.reduction_event_list == 0 then
            cmn.skynet.wait()
        else
            -- Check the power state of the head event's host before raising anything.
            local head_host = obj.reduction_event_list[1].host_id
            local power_state = common_interface.get_instance().power_status[head_host]
            if power_state ~= nil and power_state ~= 'OFF' then
                local found = set_pcie_link_reduced(obj, obj.reduction_event_list[1])
                update_reduction_event_count(obj, obj.reduction_event_list[1], found)
            else
                -- Unknown or powered-off: drop every queued event to avoid false alarms.
                obj.reduction_event_list = {}
            end
        end
    end
end

-- Queue a link-degradation report for the consumer coroutine.
-- Shared by the bandwidth-reduction and link-speed-reduced entry points, which differ only in
-- the fault type they enqueue.
-- @param service    the device service instance
-- @param ctx        request context; ctx.SystemId selects the host (falls back to 1)
-- @param fault_type BANDWIDTH_REDUCTION or LINK_SPEED_REDUCED
-- @return RET_OK (the event is only queued here; it is applied asynchronously)
local function enqueue_reduction_event(service, ctx, fault_type, segment, bus, device, func, value)
    local host_id = tonumber(ctx.SystemId) or 1
    local queue = service.reduction_event_list
    queue[#queue + 1] = {
        segment = segment,
        bus = bus,
        device = device,
        func = func,
        value = value,
        fault_type = fault_type,
        host_id = host_id,
        countdown = 180  -- retry budget; the original note describes it as "at most 3 minutes"
    }
    -- The consumer only blocks while the queue is empty, so wake it on the first entry.
    if #queue == 1 then
        cmn.skynet.wakeup(service.reduction_event_co)
    end
    return RET_OK
end

-- Report a bandwidth reduction for the device at segment/bus/device/function.
-- @return RET_OK once the event is queued
function c_device_service:method_set_bandwidth_reduction(ctx, segment, bus, device, func, value)
    return enqueue_reduction_event(self, ctx, BANDWIDTH_REDUCTION, segment, bus, device, func, value)
end

-- Report a link speed reduction for the device at segment/bus/device/function.
-- @return RET_OK once the event is queued
function c_device_service:method_set_link_speed_reduced(ctx, segment, bus, device, func, value)
    return enqueue_reduction_event(self, ctx, LINK_SPEED_REDUCED, segment, bus, device, func, value)
end

-- Return the DeviceName of the PCIe device addressed by segment/socket/bus/device/function.
-- @param ctx request context; ctx.SystemId selects the host (falls back to 1)
-- @return RET_OK, name on success; RET_ERR, NULL_VAL when the lookup raises or finds nothing
function c_device_service:method_get_device_name(ctx, segment, socket_id, bus, device, func)
    local host_id = tonumber(ctx.SystemId) or 1
    local ok, obj = pcall(self.get_pcie_device_by_ssbdf, self, segment, socket_id, bus, device,
        func, host_id)
    if not ok then
        -- On failure pcall hands back the error value; log it so a raised error can be told
        -- apart from a plain "device not found".
        log:error('method_get_device_name fail: %s', tostring(obj))
        return RET_ERR, NULL_VAL
    end
    if not obj then
        log:error('method_get_device_name fail')
        return RET_ERR, NULL_VAL
    end
    return RET_OK, obj.mds.DeviceName
end

-- Number of entries currently held in self.pcie_card_list.
function c_device_service:method_get_pcie_card_num()
    local cards = self.pcie_card_list
    return #cards
end

-- Store a BIOS-reported fault status on the first listed device whose bus/device/function
-- match and whose MultihostPresence has the requesting host's bit set.
-- NOTE(review): this reads Bus/Device/Function/MultihostPresence directly on the list entry,
-- while the lookup helpers in this file go through `.mds` -- confirm which is intended.
-- @param ctx request context; ctx.SystemId selects the host (falls back to 1)
-- @return 0 when a device was updated, -1 when nothing matched
function c_device_service:set_fault_status_by_bios(ctx, card_bus, card_device, card_function,
    fault_status)
    local host_id = tonumber(ctx.SystemId) or 1
    for _, dev in pairs(self.pcie_device_list) do
        local same_bdf = dev.Bus == card_bus and dev.Device == card_device and
            dev.Function == card_function
        if same_bdf and (dev.MultihostPresence & (1 << (host_id - 1))) > 0 then
            dev.FaultByBios = fault_status
            return 0
        end
    end
    return -1
end

-- Check that a request arrived on the expected channel (SYS_CHAN_NUM).
-- @param ctx request context carrying chan_num
-- @return comp_code.Success when acceptable, comp_code.InvalidFieldRequest when ctx is
--         missing, comp_code.CommandNotAvailable when the channel number differs
local function validate_msg_channal(ctx)
    local result = comp_code.Success
    if not ctx then
        log:error('validate: ctx is null.')
        result = comp_code.InvalidFieldRequest
    elseif ctx.chan_num ~= SYS_CHAN_NUM then
        log:error('validate: channel num(%s) invalid', ctx.chan_num)
        result = comp_code.CommandNotAvailable
    end
    return result
end

-- Extract the serial number and part number from req.VPDItemData.
-- Layout as read by this parser: byte 1 is the item count, followed by that many items of
-- <2-byte key><1-byte length><length bytes of data>. Keys 'SN' and 'PN' are kept, others are
-- skipped. Parsing stops at the first truncated item and returns what was collected so far.
-- @param req request whose VPDItemData is the raw byte string
-- @return sn, pn (each may be nil)
local function parse_vpd_info(req)
    local raw = req.VPDItemData
    if not raw or #raw < 1 then
        log:error('card_vpd_info is invalid or empty')
        return nil, nil
    end

    local total = #raw
    local serial, part
    local pos = 2
    for _ = 1, raw:byte(1) do
        if pos + 1 > total then
            log:error('Insufficient data to read key')
            break
        end
        local key = raw:sub(pos, pos + 1)

        local len_pos = pos + 2
        if len_pos > total then
            log:error('Insufficient data to read item length')
            break
        end
        local item_len = raw:byte(len_pos)

        local data_last = len_pos + item_len
        if data_last > total then
            log:error('Insufficient data to read item_data')
            break
        end
        local payload = raw:sub(len_pos + 1, data_last)

        -- Drop the surplus space characters BIOS reports: keep only what precedes the first space.
        local blank = payload:find(' ')
        if blank then
            payload = payload:sub(1, blank - 1)
        end
        pos = data_last + 1

        -- The key is always exactly two bytes here, so a plain comparison is sufficient.
        if key == 'SN' then
            serial = payload
        elseif key == 'PN' then
            part = payload
        end
    end

    return serial, part
end

-- Store the serial/part number reported for the card at req.BusNumber/DeviceNumber/
-- FunctionNumber in the PCIeVPDInfo table of self.reset_local_db (insert or update).
-- @param req request with BusNumber, DeviceNumber, FunctionNumber and VPDItemData
-- @param ctx request context; must pass validate_msg_channal
-- @return comp_code.Success when stored; the validation code when the channel check fails;
--         comp_code.InvalidCommand when the BDF is invalid or neither SN nor PN was parsed
function c_device_service:write_pcie_vpd_info(req, ctx)
    local err_code = validate_msg_channal(ctx)
    if err_code ~= comp_code.Success then
        return err_code
    end

    -- An all-0xff BDF or bus 0 is rejected.
    if (req.BusNumber == UINT8_MAX and req.DeviceNumber == UINT8_MAX and
        req.FunctionNumber == UINT8_MAX) or req.BusNumber == 0 then
        log:error('write_pcie_vpd_info: BDF in msg is invalid')
        return comp_code.InvalidCommand
    end

    local sn, pn = parse_vpd_info(req)
    if not sn and not pn then
        return comp_code.InvalidCommand
    end

    local local_db_handle = self.reset_local_db
    local tbl = local_db_handle.PCIeVPDInfo

    local record = local_db_handle:select(tbl):where({DevBus = req.BusNumber,
        DevDevice = req.DeviceNumber, DevFunction = req.FunctionNumber}):first()

    local action
    if record then
        action = 'Update'
        record.SerialNumber = sn or ''
        record.PartNumber = pn or ''
    else
        action = 'Add'
        record = local_db_handle.PCIeVPDInfo({
            DevBus = req.BusNumber,
            DevDevice = req.DeviceNumber,
            DevFunction = req.FunctionNumber,
            SerialNumber = sn or '',
            PartNumber = pn or ''
        })
    end
    record:save()

    -- The format is built with `..` rather than a backslash-newline inside the literal: the
    -- latter embeds a line break plus source indentation in every log record.
    log:notice('[PCIeVPDInfo] %s PCIeVPDInfo, DevBus=%s, DevDevice=%s, DevFunction=%s, ' ..
        'SerialNumber=%s, PartNumber=%s',
        action, record.DevBus, record.DevDevice, record.DevFunction,
        record.SerialNumber, record.PartNumber)
    return comp_code.Success
end

-- Register this service with the service manager under the name 'device'.
function c_device_service:init()
    local manager = self.service_manager
    manager:register('device', self)
end

-- Constructor: keeps the injected handles, creates the empty object lists, starts the
-- coroutine that consumes link-degradation events and creates the property-changed signal.
-- @param bus             stored as self.bus
-- @param db              stored as self.db
-- @param reset_local_db  stored as self.reset_local_db
-- @param service_manager stored as self.service_manager (used by init to register)
function c_device_service:ctor(bus, db, reset_local_db, service_manager)
    self.bus = bus
    self.db = db
    self.reset_local_db = reset_local_db
    self.service_manager = service_manager

    -- Every collection starts out as its own empty table.
    local empty_lists = {
        'pcie_device_list', 'pcie_card_list', 'ocp_card_list',
        'eth_device_list', 'flash_checker_list', 'reduction_event_list'
    }
    for _, field in ipairs(empty_lists) do
        self[field] = {}
    end
    self.pcie_oob_mgmt_flag = false

    -- The handle is kept so the enqueue methods can wake the consumer.
    local function run_consumer()
        reduction_event_consumer(self)
    end
    self.reduction_event_co = cmn.skynet.fork_loop({count = 0}, run_consumer)
    self.dev_prop_changed = mc_signal.new()
end

-- Test hooks: attach helpers to the class table so they can be reached from outside this file
-- (presumably file-local functions defined earlier; their definitions are not in this section).
c_device_service.pcie_and_ocp_task = pcie_and_ocp_task
c_device_service.update_flash_checker = update_flash_checker
c_device_service.update_init_abnormal_event = update_init_abnormal_event
c_device_service.get_latest_alarm_list = get_latest_alarm_list

-- The module's value is the class passed through the mc.singleton helper.
return singleton(c_device_service)
