-- Copyright (c) 2024 Huawei Technologies Co., Ltd.
-- openUBMC is licensed under Mulan PSL v2.
-- You can use this software according to the terms and conditions of the Mulan PSL v2.
-- You may obtain a copy of Mulan PSL v2 at: http://license.coscl.org.cn/MulanPSL2
-- THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
-- EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-- MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-- See the Mulan PSL v2 for more details.


local class = require 'mc.class'
local log = require 'mc.logging'
local bs = require 'mc.bitstring'
local utils = require 'mc.utils'
local c_tasks = require 'mc.orm.tasks'
local mdb = require 'mc.mdb'
local context = require 'mc.context'
local c_acu = require 'unit_manager.class.unit.acu.acu'
local smc = require 'protocol.smc'
local i3c = require 'protocol.i3c'
local file_sec = require 'utils.file'
local cmn = require 'common'
local utils_core = require 'utils.core'
local vos = require 'utils.vos'
local cjson = require 'cjson'
local event = require 'infrastructure.event'
local skynet = require 'skynet'
local npu_enums = require 'unit_manager.class.unit.acu.npu_enums'

---@class NpuBoard: ACU @NPU board
local c_npu_board = class(c_acu)

-- Command words, frame sizes and per-bus lookup tables used by the dump-log
-- and health-monitoring code below. Tables keyed by bus type use 0 and 1 for
-- the SMC channel and 2 (I3C_BUS_TYPE) for the I3C channel.
local RET_ERR<const> = -1
local SMC_MCU_DUMP_ADDRESS_LEN <const> = 4
local SMC_MCU_DUMP_FILE_LEN <const> = 240 -- the NPU module MCU returns 240 bytes of data per read
local SMC_MCU_DUMP_DATA_LEN <const> = 244 -- each frame is a 4-byte offset followed by 240 bytes of data
local GET_DUMP_LEN_CMD <const> = 0x00004100
local SET_DUMP_ADDRESS_CMD <const> = 0x00004600
local GET_DUMP_DATA_CMD <const> = 0x00004900
local GET_BOARD_ID_CMD<const> = 0x500
local GET_BOARD_ID_LENGTH<const> = 2

local I3C_BUS_TYPE = 2
local I3C_MCU_DUMP_FILE_LEN<const> = 0x7f4  -- I3C returns 2K-12 bytes of data per read
local I3C_MCU_DUMP_DATA_LEN<const> = 0x7f8  -- each frame is a 4-byte offset followed by 2K-12 bytes of data

-- Layout of one dump-data frame: a 32-bit offset followed by the payload.
local response_data_bs <const> = bs.new([[<<
    offset:32,
    data/string
>>]])

-- Log name -> index added to the dump command words, for the I3C channel.
local i3c_log_type_list = {
    ['npu_uart_log'] = 4,
    ['npu_log'] = 5
}

-- Log name -> index added to the dump command words, for the SMC channel.
local smc_log_type_list = {
    ['maintaince_log'] = 1,
    ['operate_log'] = 2,
    ['error_log'] = 3,
    ['npu_uart_log'] = 4
}

-- Bus type -> table of log types available on that bus.
local log_type_lists = {
    [0] = smc_log_type_list,
    [1] = smc_log_type_list,
    [2] = i3c_log_type_list
}

-- Upper bound accepted for each I3C log length.
-- NOTE(review): the original per-entry comments labelled npu_uart_log as the
-- "NPU log" and npu_log as the "serial-port recording", which looks swapped;
-- both limits are 2 MB either way — confirm the intended labels.
local i3c_log_max_len = {
    ['npu_uart_log'] = 0x200000,   -- at most 2 MB
    ['npu_log'] = 0x200000         -- at most 2 MB
}

-- Upper bound accepted for each SMC log length.
local smc_log_max_len = {
    ['maintaince_log'] = 0x19200, -- 0x18C00 of flash + 0x600 of RAM
    ['operate_log'] = 0x19000, -- 0x18C00 of flash + 0x400 of RAM
    ['error_log'] = 0x32000, -- 0x31800 of flash + 0x800 of RAM
    ['npu_uart_log'] = 0x60400 -- 0x60000 of flash + 0x400 of RAM
}

-- Bus type -> table of maximum log lengths for that bus.
local log_max_lens = {
    [0] = smc_log_max_len,
    [1] = smc_log_max_len,
    [2] = i3c_log_max_len
}

-- Bus type -> protocol module used to talk to the MCU.
local CHANNEL_TYPE = {
    [0] = smc,
    [1] = smc,
    [2] = i3c
}

-- Update hook for the NPU board: delegates to task_update_mcu_version, which
-- is not defined in the code shown here (presumably inherited from the ACU
-- base class — confirm).
function c_npu_board:task_update()
    self:task_update_mcu_version()
end

--- Push the current os.time() value to the NPU board MCU.
-- The timestamp is sent as 4 bytes: the low 32 bits of os.time(), least
-- significant byte first. On an I3C board it is written through the chip
-- object's Write method; on any other bus type an SMC block write is used.
-- Failures are logged and never raised to the caller.
function c_npu_board:update_npuboard_time()
    local chip = self:get_prop('RefSMCChip')
    -- func=0x10, command=0x08, ms=1, rw=0, param=1, data_len=4
    local SET_TIME_CMD <const> = 0x40002201
    -- '<I4' fixes both width and byte order, so the payload no longer depends
    -- on the host's native `long` size or endianness. The mask keeps the
    -- value inside 32 bits, matching the previous truncate-to-4-bytes result.
    local data = string.pack('<I4', os.time() & 0xFFFFFFFF)
    local bus_type = self:get_prop('BusType') or 0
    local ok, res
    if bus_type == I3C_BUS_TYPE then
        -- Wrapped in a closure so that a nil chip is caught by pcall as well.
        ok, res = pcall(function ()
            chip:Write(context.new(), npu_enums.I3C_SET_TIME_CMD, data)
        end)
    else
        ok, res = pcall(smc.chip_blkwrite, chip, SET_TIME_CMD, data)
    end
    if not ok then
        log:error('Write (update npuboard time) failed. res: %s', res)
    end
end

-- SMC communication check for the NPU module: reads the board ID, feeds the
-- outcome (0 = read succeeded, 1 = read failed) through self.i2c_debounce and
-- writes a maintenance log entry whenever the debounced state flips.
function c_npu_board:check_mcu_i2c_communication()
    local chip = self:get_prop('RefSMCChip')
    local position = self:get_prop('Position')
    local device_name = self:get_prop('DeviceName')

    local read_ok = pcall(smc.chip_blkread, chip, GET_BOARD_ID_CMD, GET_BOARD_ID_LENGTH)
    local sample = 1
    if read_ok then
        sample = 0
    end
    -- The debouncer is fed exactly once per invocation.
    local debounced = self.i2c_debounce:get_debounced_val(sample)

    if self.i2c_error then
        if debounced == 0 then
            -- I2C access has recovered
            log:maintenance(log.MLOG_INFO, log.FC__PUBLIC_OK, 'Successfully to access I2C MCU on %s(%s).', position, device_name)
            self.i2c_error = false
        end
    elseif debounced == 1 then
        -- I2C access has started failing
        log:maintenance(log.MLOG_INFO, log.FC__PUBLIC_OK, 'Failed to access I2C MCU on %s(%s).', position, device_name)
        self.i2c_error = true
    end
end

-- Layout of the dump-length response: a single 32-bit field. Built once
-- instead of on every call.
local dump_len_bs <const> = bs.new([[<<len:32>>]])

--- Ask the MCU how long one dump log is.
-- @param chip      chip object handed to the channel's chip_blkread
-- @param index     log index added to GET_DUMP_LEN_CMD
-- @param bus_type  key into CHANNEL_TYPE selecting the protocol module
-- @return the reported length, or nil when the read or the parsing of the
--         response fails (the failure is logged)
local function get_mcu_dump_len(chip, index, bus_type)
    local cmd = GET_DUMP_LEN_CMD + index
    local channel = CHANNEL_TYPE[bus_type]
    local ok, rsp = pcall(channel.chip_blkread, chip, cmd, SMC_MCU_DUMP_ADDRESS_LEN)
    if not ok or not rsp then
        log:error("SMC get (dump max len) failed, error:%s", rsp)
        return
    end
    -- Parse under pcall: a response that cannot be unpacked must look like any
    -- other failed attempt so that the caller's retry loop keeps going instead
    -- of being aborted by a raised error.
    local parsed, len = pcall(function ()
        return dump_len_bs:unpack(rsp, true).len
    end)
    if not parsed or not len then
        log:error("SMC parse (dump max len) failed, error:%s", len)
        return
    end
    return len
end

--- Tell the MCU at which offset the next dump read should start.
-- @param chip      chip object handed to the channel's chip_blkwrite
-- @param index     log index added to SET_DUMP_ADDRESS_CMD
-- @param offset    start offset to set
-- @param bus_type  key into CHANNEL_TYPE selecting the protocol module
-- @return true on success, nothing on failure (the failure is logged at debug level)
local function set_dump_start_offset(chip, index, offset, bus_type)
    log:info('set offset is %s', offset)
    local cmd = SET_DUMP_ADDRESS_CMD + index
    local write = CHANNEL_TYPE[bus_type].chip_blkwrite
    -- NOTE(review): 'L' packs a native unsigned long, which is 8 bytes on
    -- typical 64-bit builds, whereas update_npuboard_time truncates to 4
    -- bytes — confirm the payload width the MCU expects here.
    local written, err = pcall(write, chip, cmd, string.pack('L', offset))
    if written then
        return written
    end
    log:debug("SMC set (dump start address) failed, error:%s", err)
end

--- Set the dump start offset, retrying up to 10 times.
-- Sleeps 0.5 s after every failed attempt (including the last one) and raises
-- an error when all attempts have failed.
local function retry_set_dump_start_offset(chip, index, offset, bus_type)
    local attempts_left = 10
    while attempts_left > 0 do
        if set_dump_start_offset(chip, index, offset, bus_type) then
            return
        end
        cmn.skynet.sleep(50) -- wait 0.5 s after a failed attempt
        attempts_left = attempts_left - 1
    end
    error('set dump start offset failed')
end

--- Read one dump frame from the MCU and unpack it with response_data_bs.
-- The frame length requested is I3C_MCU_DUMP_DATA_LEN on the I3C bus and
-- SMC_MCU_DUMP_DATA_LEN otherwise.
-- @return whatever response_data_bs:unpack yields for the frame (callers read
--         its `offset` and `data` fields), or nothing when the read fails
local function get_mcu_dump_data(chip, index, bus_type)
    local cmd = GET_DUMP_DATA_CMD + index
    local channel = CHANNEL_TYPE[bus_type]
    local frame_len = SMC_MCU_DUMP_DATA_LEN
    if bus_type == I3C_BUS_TYPE then
        frame_len = I3C_MCU_DUMP_DATA_LEN
    end
    local ok, rsp = pcall(channel.chip_blkread, chip, cmd, frame_len)
    if ok and rsp then
        return response_data_bs:unpack(rsp)
    end
    log:debug("SMC get (mcu dump data) failed, error:%s", rsp)
end

--- Write a freshly collected dump to `<file_path><log_type>_<timestamp>.bin`
-- and remove the previous dump(s) of the same log type.
-- @param print_data  string with the dump contents
-- @param file_path   target directory; it is concatenated directly with the
--                    file name, so it must end with a path separator
-- @param log_type    log name, used as the file-name prefix and to recognise
--                    older dumps of the same type
function c_npu_board:update_dump_data(print_data, file_path, log_type)
    local date_str = os.date("%Y%m%d%H%M%S", os.time()) -- current time as YYYYMMDDhhmmss
    local new_log_name = string.format('%s_%s.bin', log_type, date_str)
    local file_name = file_path .. new_log_name

    -- Collect every existing dump of this type before writing. The name about
    -- to be written is excluded: when two dumps land in the same second the
    -- new file reuses the old name and must not be deleted afterwards.
    local stale_logs = {}
    if vos.get_file_accessible(file_path) then
        for _, log_name in pairs(utils_core.dir(file_path)) do
            -- plain find: log_type is literal text, not a Lua pattern
            if log_name ~= new_log_name and string.find(log_name, log_type, 1, true) then
                stale_logs[#stale_logs + 1] = log_name
            end
        end
    end

    local file, err = file_sec.open_s(file_name, 'w+')
    if not file then
        log:error('open %s failed, err: %s', log_type, err)
        return
    end
    utils.close(file, pcall(file.write, file, print_data))
    utils_core.chmod_s(file_name, utils.S_IRUSR | utils.S_IWUSR | utils.S_IRGRP)

    -- Old dumps are removed only after the new one has been written.
    for _, log_name in ipairs(stale_logs) do
        utils.remove_file(file_path .. log_name)
    end
end

--- Collect one dump log from the MCU and store it through update_dump_data.
-- Steps: query the log length (up to 10 tries), reject it when missing or
-- above the per-type maximum, then read the log frame by frame from offset 0.
-- A frame that is missing or carries an unexpected offset triggers a re-seek;
-- 30 consecutive bad frames abort the collection without writing a file.
-- @param log_type   log name (key of the per-bus log type table)
-- @param file_path  directory the dump file is written to
-- @param bus_type   bus type used to select tables and protocol module
function c_npu_board:update_dump_info(log_type, file_path, bus_type)
    local chip = self:get_prop('RefSMCChip')
    local index = log_type_lists[bus_type][log_type]

    local len
    local attempts = 0
    repeat
        len = get_mcu_dump_len(chip, index, bus_type)
        attempts = attempts + 1
        if not len then
            cmn.skynet.sleep(50) -- hardware recommends waiting 0.5 s after a failed query
        end
    until len or attempts >= 10

    if not len or len > log_max_lens[bus_type][log_type] then
        log:error('len is invalid,len: %s,name:%s', len, self.mds_obj.DeviceName)
        return
    end
    log:notice('get len is %s, name:%s', len, self.mds_obj.DeviceName)

    -- Number of payload bytes the offset advances by per accepted frame.
    local chunk_len = SMC_MCU_DUMP_FILE_LEN
    if bus_type == I3C_BUS_TYPE then
        chunk_len = I3C_MCU_DUMP_FILE_LEN
    end

    local offset, retries = 0, 30
    local chunks = {}
    retry_set_dump_start_offset(chip, index, 0, bus_type)
    while offset < len and retries > 0 do
        if self.mds_obj.CurrentUpgradeStatus ~= 0 then
            -- The MCU is being upgraded: pause collection for one minute.
            -- This branch does not consume a retry.
            cmn.skynet.sleep(6000)
            log:notice('mcu is upragde, pause collect, Name:%s', self.mds_obj.DeviceName)
        else
            local frame = get_mcu_dump_data(chip, index, bus_type)
            if frame and frame.offset == offset then
                chunks[#chunks + 1] = frame.data
                offset = offset + chunk_len
                retries = 30
            else
                -- Something went wrong: the start address has to be set again.
                retries = retries - 1
                retry_set_dump_start_offset(chip, index, offset, bus_type)
                cmn.skynet.sleep(10)
            end
        end
    end

    if retries <= 0 then
        log:error('[NpuBoard]npu dump is lack, len:%s, expected len is :%s', offset, len)
        return
    end
    self:update_dump_data(table.concat(chunks), file_path, log_type)
end

--- Collect every dump log available on this board's bus type into
-- /data/var/log/socboard/<DeviceName>_<Name>/.
-- Each log type is collected under pcall, so a failure of one type is logged
-- and the remaining types are still attempted.
function c_npu_board:update_dump_log()
    local mds = self.mds_obj
    local file_path = string.format('/data/var/log/socboard/%s_%s/', mds.DeviceName, mds.Name)
    utils.mkdir_with_parents(file_path, utils.S_IRUSR | utils.S_IWUSR | utils.S_IRGRP)

    local bus_type = self:get_prop('BusType') or 0
    for log_type in pairs(log_type_lists[bus_type]) do
        local ok, err = pcall(self.update_dump_info, self, log_type, file_path, bus_type)
        if not ok then
            log:error('get dump log(%s) failed, error :%s', self.mds_obj.DeviceName, err)
        end
    end
    log:notice('get all dump log(%s) end', self.mds_obj.DeviceName)
end

--- Copy this board's collected log directory into the log-dump staging area.
-- Both the staging directory and the board's log directory are checked with
-- file_sec.check_realpath_before_open_s before /bin/cp -r is invoked; a
-- non-zero check result is logged and aborts the copy.
function c_npu_board:on_dump_mcu_log_cb()
    local dump_dir = "/dev/shm/dump_info_tmp/dump_info/LogDump/socboard"
    local dir_mode = utils.S_IRUSR | utils.S_IWUSR | utils.S_IRGRP
    utils.mkdir_with_parents(dump_dir, dir_mode)

    local dump_dir_ret = file_sec.check_realpath_before_open_s(dump_dir)
    if dump_dir_ret ~= 0 then
        log:error('check socboard path failed, err:%s', dump_dir_ret)
        return
    end

    local board_log_dir = string.format('/data/var/log/socboard/%s_%s',
        self.mds_obj.DeviceName, self.mds_obj.Name)
    local log_dir_ret = file_sec.check_realpath_before_open_s(board_log_dir)
    if log_dir_ret ~= 0 then
        log:error('check log path failed, Name: %s, err:%s', self.mds_obj.DeviceName, log_dir_ret)
        return
    end

    vos.system_s("/bin/cp", "-r", board_log_dir, dump_dir)
end

--- Initialise self.health / self.assert / self.fault_code from an alarm list.
-- The state is first reset to "no alarm" (0, 0, ''). The first alarm that
--   * has an EventName present in npu_enums.event_map_health_table,
--   * has a ComponentName equal to this board's DeviceName, and
--   * carries a JSON array in MessageArgs whose last element is a non-empty
--     fault code
-- then sets health from the table, assert to 1 and fault_code to that last
-- element. Alarms that do not qualify are skipped with a debug log.
-- @param alarm_list table of alarm records (EventName, ComponentName, MessageArgs)
function c_npu_board:generate_fault_code(alarm_list)
    self.health = 0
    self.assert = 0
    self.fault_code = ''
    -- The current alarm is identified by ComponentName plus the error code
    -- stored in MessageArgs.
    for _, alarm in pairs(alarm_list) do
        local health = npu_enums.event_map_health_table[alarm.EventName]
        if not health then
            log:debug('alarm.EventName not match') -- not one of the alarms handled here
        elseif not alarm.ComponentName or alarm.ComponentName ~= self.mds_obj.DeviceName then
            log:debug('alarm.ComponentName not match')
        elseif not alarm.MessageArgs or alarm.MessageArgs == "" then
            log:debug('alarm.MessageArgs empty')
        else
            local is_ok, alarm_msg_args = pcall(cjson.decode, alarm.MessageArgs)
            if not is_ok then
                log:debug('alarm.MessageArgs decode failed, err: %s', alarm_msg_args)
            elseif type(alarm_msg_args) ~= 'table' then
                -- Valid JSON that is not an array cannot carry a fault code.
                log:debug('alarm.MessageArgs is not an array')
            else
                -- The last element of alarm_msg_args is the error code string.
                local code = alarm_msg_args[#alarm_msg_args]
                -- An empty array yields nil and a JSON null yields cjson.null;
                -- neither is a usable fault code, so treat them like ''.
                if code == nil or code == '' or code == cjson.null then
                    log:debug('alarm.MessageArgs fault code empty')
                else
                    self.health = health
                    self.assert = 1
                    self.fault_code = code
                    break
                end
            end
        end
    end
    log:notice('Init %s health: %s, assert: %s, fault code: %s',
        self.mds_obj.DeviceName, self.health, self.assert, self.fault_code)
end

--- Fetch the latest alarm list and initialise the board's health state.
-- Up to 10 fetches are made. After the first successful fetch the loop sleeps
-- 5000 + 1000 ticks (60 s) so that all alarms can be published, then fetches
-- again; a success after that ends the loop. A failed fetch is followed by a
-- 1000-tick sleep. The result of the last fetch performed decides the outcome.
-- @return true after generate_fault_code has run; otherwise the falsy status
--         returned by the last fetch (the failure is logged)
function c_npu_board:init_npu_board_health()
    local ok, alarm_list
    local seen_success = false
    for _ = 1, 10 do
        ok, alarm_list = event.get_latest_alarm_list(self.bus)
        if ok then
            if seen_success then
                log:notice('Get latest alarm list successfully, name: %s', self.mds_obj.DeviceName)
                break
            end
            -- First successful access: wait before fetching the list again.
            skynet.sleep(5000)
            seen_success = true
        end
        skynet.sleep(1000)
    end

    if ok then
        self:generate_fault_code(alarm_list)
        return true
    end
    log:error('Init npu board health failed, name: %s err: %s', self.mds_obj.DeviceName, alarm_list)
    return ok
end

--- Raise or clear the NPU board alarm event and record the new state.
-- Nothing is sent when params.assert already equals self.assert.
-- @param params table with fields `assert` (1 = raise, otherwise clear),
--               `health` (key into npu_enums.event_table) and `fault_code`
-- @return true when no event was needed or the event was generated; false
--         when event.generate_event raised (state is then left untouched)
function c_npu_board:generate_npuboard_event(params)
    local device_name = self.mds_obj.DeviceName
    if params.assert == self.assert then
        log:info('%s assert not change, assert: %s', device_name, params.assert)
        return true
    end

    local state = 'false'
    if params.assert == 1 then
        state = 'true'
    end
    local event_key = npu_enums.event_table[params.health]
    local message_args = cjson.encode(
        {device_name, '', self.mds_obj.Name, 'Error Code: ', params.fault_code})
    local param = {
        {'ComponentName', device_name},
        {'State', state},
        {'EventKeyId', event_key},
        {'MessageArgs', message_args},
        {'SystemId', '1'},
        {'ManagerId', '1'},
        {'ChassisId', '1'},
        {'NodeId', ''},
        {'SubjectType', '92'}
    }

    local ok, ret = pcall(event.generate_event, param)
    if not ok then
        log:info('%s add event %s failed, res: %s', device_name, event_key, ret)
        return false
    end

    -- Remember what was reported; a cleared alarm resets health and fault code.
    self.assert = params.assert
    if params.assert == 0 then
        self.health = 0
        self.fault_code = ''
    else
        self.health = params.health
        self.fault_code = params.fault_code
    end
    return true
end

--- One polling step of the NPU board health monitor.
-- Skips the poll when the chip reference or chip object cannot be obtained or
-- when the chip's LockStatus is non-zero (polling is not done during an
-- upgrade). Otherwise reads the board health through `channel`:
--   * health 0 clears the current alarm;
--   * a non-zero health whose level or fault code differs from the recorded
--     one clears the current alarm and raises a new one.
-- Read failures are only logged; the original comments state that the Chip's
-- HealthStatus is monitored elsewhere to report those.
-- @param channel protocol module providing get_npu_board_health and
--                get_npuboard_error_code
function c_npu_board:monitor_npu_board_health(channel)
    local device_name = self.mds_obj.DeviceName
    local chip = self:get_prop('RefSMCChip')
    -- chip.path is read outside the pcall below, so a missing reference has
    -- to be handled here instead of raising out of the polling task.
    if not chip then
        log:info('%s chip reference is not available', device_name)
        return
    end

    -- No polling while an upgrade is in progress.
    local ok, obj = pcall(mdb.get_object, self.bus, chip.path, 'bmc.kepler.Chip')
    if not ok then
        log:info('%s get chip object failed, error: %s', device_name, obj)
        return
    end
    if obj.LockStatus ~= 0 then
        log:info('%s chip lock status is %s', device_name, obj.LockStatus)
        return
    end

    local health_info = channel.get_npu_board_health(chip)
    if not health_info then
        -- Access failed: leave; the Chip's HealthStatus is monitored to report the alarm.
        log:info('%s get npu board health failed.', device_name)
        return
    end
    if health_info.health == 0 then
        -- Clear the alarm.
        self:generate_npuboard_event({health = self.health, fault_code = self.fault_code, assert = 0})
        return
    end

    local fault_code = channel.get_npuboard_error_code(chip, health_info)
    -- I3C reads consecutive addresses, so in theory the health read cannot
    -- succeed while the fault-code read fails.
    if not fault_code then
        -- Access failed: leave; the Chip's HealthStatus is monitored to report the alarm.
        log:info('%s get npu board fault code failed.', device_name)
        return
    end

    -- A change of either the alarm level or the fault code requires re-reporting.
    if health_info.health ~= self.health or fault_code ~= self.fault_code then
        -- Clear the current alarm ...
        self:generate_npuboard_event({health = self.health, fault_code = self.fault_code, assert = 0})
        -- ... then raise the new one.
        self:generate_npuboard_event({health = health_info.health, fault_code = fault_code, assert = 1})
    end
end

--- Start the health-monitoring task for this board.
-- Alarm detection is currently supported on I3C only. When the initial alarm
-- state cannot be loaded, the event module is considered faulty and no task is
-- started. Otherwise a task named "<DeviceName>monitor health task" is created
-- that calls monitor_npu_board_health, with set_timeout_ms(30000).
function c_npu_board:update_npuboard_health()
    local bus_type = self:get_prop('BusType') or 0
    if bus_type == I3C_BUS_TYPE and self:init_npu_board_health() then
        local channel = CHANNEL_TYPE[bus_type]
        local task_manager = c_tasks.get_instance()
        local task = task_manager:new_task(self.mds_obj.DeviceName .. 'monitor health task')
        local looped = task:loop(function()
            self:monitor_npu_board_health(channel)
        end)
        looped:set_timeout_ms(30000)
    end
end

return c_npu_board