-- Copyright (c) 2024 Huawei Technologies Co., Ltd.
-- openUBMC is licensed under Mulan PSL v2.
-- You can use this software according to the terms and conditions of the Mulan PSL v2.
-- You may obtain a copy of Mulan PSL v2 at:
--         http://license.coscl.org.cn/MulanPSL2
-- THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
-- EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-- MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-- See the Mulan PSL v2 for more details.

local class = require 'mc.class'
local log = require 'mc.logging'
local utils = require 'mc.utils'
local c_tasks = require 'tasks'
local singleton = require 'mc.singleton'
local c_storageconfig = require 'storageconfig.storageconfig_object'
local c_controller_service = require 'controller.controller_collection'
local add_event = require 'add_event'
local common_def = require 'common_def'
local utils_core = require 'utils.core'
local file_sec = require 'utils.file'
local sml = require 'sml'

-- PHY bit-error diagnosis class; all methods below are defined on it.
local c_phy_biterr = class()

-- Upper bound applied to the PHY count reported for one RAID controller.
local SML_MAX_SAS_PHY_PER_CTRL <const> = 40
-- Controllers whose TypeId equals this value (soft RAID) are skipped by diagnose_init.
local SM_SOFT_RAID_CONTROLLER_TYPE_ID <const> = 0xC0
-- Number of per-PHY slots (indices 0 .. N-1) pre-allocated in each diagnosis table.
local SML_MAX_SAS_PHY_PER_EXPANDER <const> = 100
-- Root directory under which the per-RAID-card PHY error logs are written.
local PHY_BITERR_LOG_BASE_PATH <const> = '/data/var/log/storage/phy'
-- PHY type selector passed to the log writers; selects the RAID card layout.
local PHY_RAID_CARD <const> = 0

-- Name of the periodic task registered by start().
local TASK_UPDATE <const> = 'phy_biterr_update'

-- Constructor: puts the diagnosis state into its initial (idle) condition.
function c_phy_biterr:ctor()
    -- Storage configuration singleton and its object; both are assigned by
    -- the periodic task created in start().
    self.config, self.config_obj = nil, nil
    -- One diagnosis record per monitored RAID controller.
    self.diagnose_list = {}
    -- Timestamp (seconds) of the last diagnosis round; 0 means "never run".
    self.phy_biterr_last_diag_time = 0
end

-- Initialization hook: simply launches the periodic PHY bit-error task.
function c_phy_biterr:init()
    self.start(self)
end

-- Build a zero-based table with SML_MAX_SAS_PHY_PER_EXPANDER slots whose
-- content depends on `type`:
--   'invalid_cnt'  : every slot is the number 0
--   'err_status'   : every slot is a status record whose flag fields are
--                    extracted from the bit mask `data`
--   'err_cnt_info' : every slot is a zeroed error counter record
-- Any other `type` yields an empty table.
function c_phy_biterr:init_diag_info_data(type, data)
    local slots = {}
    local last_index = SML_MAX_SAS_PHY_PER_EXPANDER - 1

    if type == 'invalid_cnt' then
        for idx = 0, last_index do
            slots[idx] = 0
        end
        return slots
    end

    if type == 'err_status' then
        for idx = 0, last_index do
            local status = {}
            -- Whether a "bit error increased too fast" SEL event was raised for this PHY since restart
            status['sel_reported'] = 0
            -- Whether the error counters changed compared with the previous sample
            status['err_cnt_changed'] = data & (1 << 30)
            -- Direction of the change compared with the previous sample: 1 decreased, 0 increased
            status['err_cnt_change_dir'] = data & (1 << 29)
            -- Whether the increase since the previous sample exceeded the threshold
            status['increased_over_thresh'] = data & (1 << 28)
            status['phy_err_status_reserved'] = data & 0xFFF
            slots[idx] = status
        end
        return slots
    end

    if type == 'err_cnt_info' then
        for idx = 0, last_index do
            local counters = {}
            counters['phy_id'] = 0
            counters['invalid_dword_count'] = 0
            counters['loss_dword_sync_count'] = 0
            counters['phy_reset_problem_count'] = 0
            counters['running_disparity_error_count'] = 0
            slots[idx] = counters
        end
    end

    return slots
end

-- Create a fresh, fully zeroed diagnosis record for one RAID controller.
function c_phy_biterr:generate_new_diag_info()
    -- PHY error counters and their bookkeeping
    local raid_phy = {}
    -- Whether valid error counters have been obtained at least once
    raid_phy['base_data_obtained'] = 0
    -- Marks the round in which valid data was obtained for the first time
    raid_phy['first_valid_data'] = 0
    -- Consecutive update failures after valid data was obtained; reset to 0 on success
    raid_phy['failed_cnt'] = 0
    -- Copy of failed_cnt taken when an update succeeds
    raid_phy['failed_cnt_bak'] = 0
    raid_phy['phy_count'] = 0
    -- Per-PHY analysis/diagnosis status
    raid_phy['err_status'] = self:init_diag_info_data('err_status', 0)
    -- Per-PHY error counter data
    raid_phy['err_cnt_info'] = self:init_diag_info_data('err_cnt_info', 0)
    -- Per-PHY count of invalid samples; cleared when a valid sample arrives
    raid_phy['invalid_cnt'] = self:init_diag_info_data('invalid_cnt', 0)

    -- State of the file-based mock input
    local mock_info = {}
    -- Name of the mock input file
    mock_info['file_name'] = ''
    -- Whether file-based mock input has been switched on
    mock_info['mock_enabled'] = 0
    -- Index of the record currently read from the mock file
    mock_info['record_index'] = 0
    -- Set right after the mock command; cleared once data was read from the file
    mock_info['mock_first_time'] = true

    local diag_info = {}
    diag_info['obj_index'] = 0
    -- Number of PHYs on the RAID card itself, configured in the CSR
    diag_info['max_raid_phy_count'] = 0
    diag_info['raid_phy'] = raid_phy
    diag_info['raid_mock_mgnt_info'] = mock_info
    return diag_info
end

-- Scan the RAID controller objects and register every controller that can
-- take part in storage diagnosis. Soft RAID controllers are ignored and a
-- controller that already has a record in self.diagnose_list is not added
-- again.
function c_phy_biterr:diagnose_init()
    local ok, controllers = pcall(function ()
        return c_controller_service.get_instance():get_all_controllers()
    end)
    if not ok then
        return
    end

    -- True when a diagnosis record for the given controller id already exists.
    local function already_tracked(ctrl_id)
        for _, entry in pairs(self.diagnose_list) do
            if entry.obj_index == ctrl_id then
                return true
            end
        end
        return false
    end

    for _, ctrl in pairs(controllers) do
        -- Filter out soft RAID, then skip controllers that are already listed
        if ctrl.TypeId ~= SM_SOFT_RAID_CONTROLLER_TYPE_ID and not already_tracked(ctrl.Id) then
            local entry = self:generate_new_diag_info()
            entry.obj_index = ctrl.Id
            entry.max_raid_phy_count = #ctrl:get_phy_children()

            -- Append the new record to the list
            table.insert(self.diagnose_list, entry)
        end
    end
end

-- Register the periodic task that refreshes the controller list and runs the
-- PHY bit-error processing. The task is configured with set_timeout_ms(10000).
function c_phy_biterr:start()
    log:notice('Start update phy biterr.')

    -- One iteration of the periodic task.
    local function tick(task)
        if c_storageconfig.get_instance().power_on == 0 then
            -- Host is powered off: clear the last diagnosis time and drop all records
            self.phy_biterr_last_diag_time = 0
            self.diagnose_list = {}
            return
        end

        self.config = c_storageconfig.get_instance()
        self.config_obj = c_storageconfig.get_instance().obj

        self:diagnose_init()
        self:phy_biterr_process()
    end

    local update_task = c_tasks.get_instance():new_task(TASK_UPDATE)
    update_task:loop(tick):set_timeout_ms(10000)
end

-- Main entry of the PHY bit-error processing. Runs one diagnosis round over
-- every record in self.diagnose_list, provided diagnosis is enabled,
-- out-of-band management is supported and the diagnosis interval has elapsed.
function c_phy_biterr:phy_biterr_process()
    -- Is bit-error diagnosis required at all?
    local cfg_obj = self.config_obj
    if not cfg_obj or not cfg_obj.PhyErrorEnabled then
        return
    end

    -- Is out-of-band management available?
    if not self.config:support_oob() then
        return
    end

    -- Has the diagnosis interval expired?
    if not self:check_phy_biterr_diagnose_startup() then
        return
    end

    self.phy_biterr_last_diag_time = math.floor(os.time())

    for _, diag in pairs(self.diagnose_list) do
        -- If the system was powered off meanwhile, skip the remaining
        -- RAID cards / expanders and end this round early
        if c_storageconfig.get_instance().power_on == 0 then
            return
        end

        local fresh = self:generate_new_diag_info()
        -- Only a nil/false result is treated as an acquisition failure here
        if self:phy_biterr_get(diag, fresh) then
            -- Analyze the error data
            self:phy_biterr_analyze_raid(diag, fresh)

            -- Record changed PHY error data of the RAID card in its log file
            self:phy_biterr_record_raid(diag)

            -- Raise a SEL event when the errors grow too fast
            self:phy_biterr_sel_generate(diag)
        else
            self:phy_biterr_get_failed_cnt(diag)
        end
    end
end

-- Record RAID PHY error changes in the per-card log file.
-- p_phyerr: diagnosis record of one RAID controller (nil is ignored).
-- On the first valid sample of a recording period the existing log files are
-- rotated and a new file with header and base data is created. Otherwise a
-- missing file is created, or one record is inserted for every PHY whose
-- err_cnt_changed flag is 1 (the flag is cleared afterwards).
function c_phy_biterr:phy_biterr_record_raid(p_phyerr)
    if not p_phyerr then
        return
    end

    -- Look up the RAID card to obtain its component name
    local ok, ctrl = pcall(function ()
        return c_controller_service.get_instance():get_by_controller_id(p_phyerr.obj_index)
    end)

    -- The lookup may succeed and still yield no controller; without this
    -- check the DeviceName access below would raise.
    if not ok or not ctrl then
        return
    end

    -- Only the first gsub result (the substituted string) is kept
    local raid_card_name = string.gsub(ctrl.DeviceName, " ", "_")
    local dir_path = PHY_BITERR_LOG_BASE_PATH .. '/' .. raid_card_name

    -- Create the log directory of this RAID card when it does not exist yet.
    -- A failure is only logged; the file creation below reports its own error.
    if not utils_core.is_dir(dir_path) then
        if not utils.mkdir_with_parents(dir_path,
            utils.S_IRWXU | utils.S_IRGRP | utils.S_IXGRP | utils.S_IROTH | utils.S_IXOTH) then
            log:error('create phy log dir failed')
        end
    end

    local base_path = dir_path .. '/' .. raid_card_name .. '_PHY_Error_Count'
    local log_file_path_name = base_path .. '.csv'
    local raid_phy = p_phyerr.raid_phy

    -- First valid sample of a recording period: start a new log file
    if raid_phy.first_valid_data == 1 then
        -- Rename the existing log files
        self:check_log_file_rotate(base_path, 'csv', self.config_obj.MaxPhyErrorLogFileRotationCount)

        -- Create the current file and write the initial data
        self:phy_log_create_file(log_file_path_name, p_phyerr, PHY_RAID_CARD)
        raid_phy.first_valid_data = 0
        return
    end

    -- File is missing: create it and write the current data as the base line
    if not utils.realpath(log_file_path_name) then
        self:phy_log_create_file(log_file_path_name, p_phyerr, PHY_RAID_CARD)
        return
    end

    -- Insert a record for every PHY whose error counters changed
    for i = 0, raid_phy.phy_count - 1 do
        local status = raid_phy.err_status[i]
        if status.err_cnt_changed == 1 then
            self:phy_log_insert_record(log_file_path_name, raid_phy.err_cnt_info[i])
            status.err_cnt_changed = 0
        end
    end
end

-- Locate the records of one PHY id inside a log file.
-- Lines are the non-empty '\n'-separated pieces of the file, numbered from 1.
-- A line belongs to the PHY when its first whitespace-delimited token equals
-- "<phy_id>,".
-- Returns first_line, last_line, record_count (-1, -1, 0 when the PHY has no
-- record); returns nothing when the file cannot be opened or read.
function c_phy_biterr:phy_log_find_specified_record(file_path, phy_id)
    local file = file_sec.open_s(file_path, "r")
    if not file then
        return
    end
    local content = utils.close(file, pcall(file.read, file, "*all"))
    if not content then
        return
    end

    local wanted_token = string.format('%s,', phy_id)
    local first_record, last_record, record_num = -1, -1, 0
    local line_no = 0
    for line in string.gmatch(content, "[^\n]+") do
        line_no = line_no + 1
        if string.match(line, "%S+") == wanted_token then
            if record_num == 0 then
                first_record = line_no
            end
            record_num = record_num + 1
            -- Line numbers only grow, so the newest match is always the last one
            last_record = line_no
        end
    end

    return first_record, last_record, record_num
end

-- Insert `str` as a new line at position `insert_pos` of a file.
-- The file is read completely, split into its non-empty lines (numbered from
-- 1), the new line is inserted and the file is rewritten with the lines
-- joined by '\n' (no trailing newline). Nothing is changed when the file
-- cannot be opened or read.
function c_phy_biterr:append_new_line(file_path, insert_pos, str)
    local reader = file_sec.open_s(file_path, "r")
    if not reader then
        return
    end
    local content = reader:read("*all")
    reader:close()
    if not content then
        return
    end

    local lines = {}
    for line in string.gmatch(content, "[^\n]+") do
        lines[#lines + 1] = line
    end
    table.insert(lines, insert_pos, str)

    -- The write handle must be checked too: calling write on a failed open
    -- would raise inside the periodic task.
    local writer = file_sec.open_s(file_path, "w+")
    if not writer then
        log:error('open phy log file for rewrite failed')
        return
    end
    writer:write(table.concat(lines, "\n"))
    writer:close()
end

-- Remove the line at position `del_pos` from a file.
-- The file is read completely, split into its non-empty lines (numbered from
-- 1), the line is removed and the file is rewritten with the remaining lines
-- joined by '\n' (no trailing newline). Nothing is changed when the file
-- cannot be opened or read.
function c_phy_biterr:delete_specific_line(file_path, del_pos)
    local reader = file_sec.open_s(file_path, "r")
    if not reader then
        return
    end
    local content = reader:read("*all")
    reader:close()
    if not content then
        return
    end

    local lines = {}
    for line in string.gmatch(content, "[^\n]+") do
        lines[#lines + 1] = line
    end
    table.remove(lines, del_pos)

    -- The write handle must be checked too: calling write on a failed open
    -- would raise inside the periodic task.
    local writer = file_sec.open_s(file_path, "w+")
    if not writer then
        log:error('open phy log file for rewrite failed')
        return
    end
    writer:write(table.concat(lines, "\n"))
    writer:close()
end

-- Add one error record of a PHY to the log file.
-- file_path: log file to update.
-- p_phy:     error counter record (phy_id plus the four counters).
-- When the PHY already has records, the new one is inserted right after its
-- last record and, once the configured PhyErrorMaxRecord is reached, the
-- oldest record of that PHY is removed. Otherwise the record is appended to
-- the end of the file.
function c_phy_biterr:phy_log_insert_record(file_path, p_phy)
    -- First record line, last record line and number of records of this PHY
    local first_pos, last_pos, record_num = self:phy_log_find_specified_record(file_path, p_phy.phy_id)
    if not last_pos then
        -- The lookup returns nothing when the file cannot be read
        log:error('read phy log file failed')
        return
    end

    local str = string.format('%-10s%-25s%-25s%-25s%-25s%s',
        string.format('%s,', p_phy.phy_id),
        string.format('%s,', os.date("%Y-%m-%d %H:%M:%S", os.time())),
        string.format('%s,', p_phy.invalid_dword_count),
        string.format('%s,', p_phy.loss_dword_sync_count),
        string.format('%s,', p_phy.phy_reset_problem_count),
        string.format('%s,', p_phy.running_disparity_error_count))

    if last_pos > 0 then
        -- Insert after the last record of this PHY
        self:append_new_line(file_path, last_pos + 1, str)

        -- Maximum number of records reached: drop the oldest one
        if record_num >= self.config_obj.PhyErrorMaxRecord then
            self:delete_specific_line(file_path, first_pos)
        end
        return
    end

    -- No record of this PHY yet: append to the end of the file. The file may
    -- or may not end with a newline (rewrites join lines without a trailing
    -- one), so a separator is added only when needed.
    local needs_separator = false
    local reader = file_sec.open_s(file_path, 'r')
    if reader then
        local content = reader:read('*all')
        reader:close()
        needs_separator = content ~= nil and content ~= '' and content:sub(-1) ~= '\n'
    end

    -- Open the log file itself in append mode; the line break belongs to the
    -- written record, not to the file name.
    local file = file_sec.open_s(file_path, 'a')
    if not file then
        log:error('open phy log file for append failed')
        return
    end
    utils.close(file, pcall(file.write, file, (needs_separator and '\n' or '') .. str .. '\n'))
end

-- Create the log file and write its first content.
-- The file is opened with 'w+', so an existing file is overwritten. The
-- header and the base data are written inside the callback handed to
-- utils.safe_close_file together with the file handle.
function c_phy_biterr:phy_log_create_file(log_file_name, p_phyerr, phy_type)
    local f_log = file_sec.open_s(log_file_name, 'w+')
    if f_log then
        -- File header followed by the first base line data
        local function write_initial_content()
            self:phy_log_write_header(f_log, p_phyerr, phy_type)
            self:phy_log_write_base_data(f_log, p_phyerr, phy_type)
        end

        utils.safe_close_file(f_log, write_initial_content)
        return
    end

    log:error('create file failed')
end

-- Write the PHY error log file header to the given file.
-- File format example:
--[[
    Example of a RAID card PHY error log
    RAID Card Information:
    Component Name       : RAID Card1
    Board Serial Number  : 0221123ASDBASD
    PHY error statistics :
    PHY#, Timestamp, InvalidDwordCount, LossDwordSyncCount, PhyResetProblemCount, RunningDisparityErrorCount
    0,    2017-11-20 10:30:00,       0,                  0,                    0,                        10,
    0,    2017-11-20 14:30:00,       1,                  2,                    1,                        20,
    0,    2017-11-21 18:00:00,       2,                  4,                    1,                        20,
]]--
-- fp:       open file handle to write to.
-- p_phyerr: diagnosis record whose obj_index identifies the controller.
-- phy_type: only PHY_RAID_CARD produces output.
-- Nothing is written when the controller cannot be resolved.
function c_phy_biterr:phy_log_write_header(fp, p_phyerr, phy_type)
    if not fp or not p_phyerr then
        return
    end

    local ok, ctrl = pcall(function ()
        return c_controller_service.get_instance():get_by_controller_id(p_phyerr.obj_index)
    end)

    -- The lookup may succeed and still yield no controller; without this
    -- check the field accesses below would raise.
    if not ok or not ctrl then
        return
    end

    if phy_type == PHY_RAID_CARD then
        -- Write the file header
        fp:write('RAID Card Information:\n')
        fp:write('Component Name       : ' .. ctrl.DeviceName .. '\n')
        fp:write('Product Name         : ' .. ctrl.DeviceName .. '\n')
        fp:write('Board Serial Number  : ' .. ctrl.SerialNumber .. '\n')
        fp:write('PHY error statistics :\n')
        fp:write('PHY#,     Timestamp,               InvalidDwordCount,       LossDwordSynchCount,     ' ..
            'PhyResetProblemCount,    RunningDisparityErrorCount\n')
    end
end

-- Write the first (base line) error record of every PHY to the given file.
-- All rows of one call share the same timestamp. Only PHY_RAID_CARD produces
-- output.
function c_phy_biterr:phy_log_write_base_data(fp, p_phyerr, phy_type)
    if not (fp and p_phyerr) then
        return
    end

    -- Timestamp string of the current time
    local cur_tm_stamp = os.date("%Y-%m-%d %H:%M:%S", os.time())
    if phy_type ~= PHY_RAID_CARD then
        return
    end

    -- Render one column value followed by the column separator.
    local function cell(value)
        return string.format('%s,', value)
    end

    -- One row per PHY of the RAID card
    local raid_phy = p_phyerr.raid_phy
    for i = 0, raid_phy.phy_count - 1 do
        local phy = raid_phy.err_cnt_info[i]
        local row = string.format('%-10s%-25s%-25s%-25s%-25s%s\n',
            cell(phy.phy_id),
            cell(cur_tm_stamp),
            cell(phy.invalid_dword_count),
            cell(phy.loss_dword_sync_count),
            cell(phy.phy_reset_problem_count),
            cell(phy.running_disparity_error_count))
        fp:write(row)
    end
end

-- Rotate the log files that share one base name.
-- The current file is "<base>.<ext>", rotated files are "<base>.<n>.<ext>".
-- Step 1: delete the files with index max_log_num - 1 and above, stopping at
--         the first index that does not exist.
-- Step 2: find the lowest index in 0 .. max_log_num - 1 that is unused.
-- Step 3: rename index k-1 to k, from that unused index downwards, so that
--         no rename overwrites an existing file.
function c_phy_biterr:check_log_file_rotate(base_name, ext_name, max_log_num)
    -- File name carrying the rotation index `seq`.
    local function indexed_name(seq)
        return string.format('%s.%s.%s', base_name, seq, ext_name)
    end
    local current_name = string.format('%s.%s', base_name, ext_name)

    -- The last allowed file has to go as well
    local stale_seq = max_log_num - 1
    local stale_name = indexed_name(stale_seq)
    while utils.realpath(stale_name) do
        utils.remove_file(stale_name)
        stale_seq = stale_seq + 1
        stale_name = indexed_name(stale_seq)
    end

    -- Check the series from the lowest index upwards and stop at the first
    -- file that does not exist; 0 means there is nothing to rename
    local first_free = 0
    for seq = 0, max_log_num - 1 do
        local candidate = (seq == 0) and current_name or indexed_name(seq)
        if not utils.realpath(candidate) then
            first_free = seq
            break
        end
    end

    -- Rename starting at the highest unused name to avoid conflicts
    for seq = first_free, 1, -1 do
        if seq ~= max_log_num then
            local old_name = (seq == 1) and current_name or indexed_name(seq - 1)
            os.rename(old_name, indexed_name(seq))
        end
    end
end

-- Check the PHY error analysis result and generate SEL events.
-- p_phyerr: diagnosis record of one RAID controller (nil is ignored).
-- For every PHY whose increase exceeded the threshold an event is generated,
-- unless one was already reported and the PhyErrorSelUnlimitFlag
-- configuration value is falsy.
function c_phy_biterr:phy_biterr_sel_generate(p_phyerr)
    if not p_phyerr then
        return
    end

    -- Flag that lifts the limit on repeated SEL reports
    local sel_report_unlimit = self.config_obj.PhyErrorSelUnlimitFlag

    local ok, controller = pcall(function ()
        return c_controller_service.get_instance():get_by_controller_id(p_phyerr.obj_index)
    end)

    -- The lookup may succeed and still yield no controller; without this
    -- check building the event description below would raise.
    if not ok or not controller then
        return
    end

    local raid_phy = p_phyerr.raid_phy
    -- Nothing to evaluate before base data has been obtained
    if raid_phy.base_data_obtained ~= 1 then
        return
    end

    local phy_count = raid_phy.phy_count
    if phy_count > SML_MAX_SAS_PHY_PER_EXPANDER then
        phy_count = SML_MAX_SAS_PHY_PER_EXPANDER
    end

    -- PHY errors grew beyond the threshold and no SEL event was reported yet
    for i = 0, phy_count - 1 do
        local status = raid_phy.err_status[i]
        local sel_reported = status.sel_reported == 1
        if status.increased_over_thresh == 1 and (not sel_reported or sel_report_unlimit) then
            -- Build the alarm text and generate the alarm event
            local desc = string.format('RAID card (%s) PHY%s bit error increased too fast.',
                controller.DeviceName, i)
            -- First SEL for this PHY, or the number of SELs is not limited.
            -- Clear first in case an alarm of the same name already exists,
            -- then raise it and clear it again so it never stays asserted.
            add_event.get_instance().generate_phy_increase_too_fast('false', desc)
            add_event.get_instance().generate_phy_increase_too_fast('true', desc)
            add_event.get_instance().generate_phy_increase_too_fast('false', desc)
            status.sel_reported = 1
        end
    end
end

-- Analyze two consecutive error samples of a RAID card.
-- pre_info: persistent record of the controller (updated in place).
-- cur_info: sample acquired in the current round.
-- Updates the PHY count, the validity flags and the failure counters of
-- pre_info, then compares the counters PHY by PHY.
function c_phy_biterr:phy_biterr_analyze_raid(pre_info, cur_info)
    local prev_phy = pre_info.raid_phy
    local cur_phy = cur_info.raid_phy

    local count = cur_phy.phy_count
    if count > SML_MAX_SAS_PHY_PER_CTRL then
        count = SML_MAX_SAS_PHY_PER_CTRL
    end
    prev_phy.phy_count = count

    -- Maintain the validity / first-data flags and the failure counters
    if count == 0 then
        -- A PHY count of 0 is handled as a failed acquisition
        if prev_phy.base_data_obtained ~= 0 then
            prev_phy.failed_cnt = prev_phy.failed_cnt + 1
        end
    else
        if prev_phy.base_data_obtained == 0 then
            prev_phy.base_data_obtained = 1
            prev_phy.first_valid_data = 1

            -- First data ever obtained: store the PHY ids first
            for i = 0, count - 1 do
                prev_phy.err_cnt_info[i].phy_id = cur_phy.err_cnt_info[i].phy_id
            end
        end
        prev_phy.failed_cnt_bak = prev_phy.failed_cnt
        prev_phy.failed_cnt = 0
    end

    self:phy_biterr_update(prev_phy, cur_phy)
end

-- Merge the current error sample into the stored one, PHY by PHY.
-- pre_raid_phy: stored raid_phy table (updated in place).
-- cur_raid_phy: raid_phy table of the current sample.
-- PHYs are paired by phy_id. On the first valid sample the data is stored
-- without comparison. An invalid sample (see phy_biterr_check) is discarded
-- and counted in invalid_cnt. A valid sample is compared against the stored
-- one, then replaces it and clears invalid_cnt.
function c_phy_biterr:phy_biterr_update(pre_raid_phy, cur_raid_phy)
    local last_index = pre_raid_phy.phy_count - 1
    for i = 0, last_index do
        for j = 0, last_index do
            if pre_raid_phy.err_cnt_info[i].phy_id == cur_raid_phy.err_cnt_info[j].phy_id then
                -- No comparison when error data is obtained for the first time
                if pre_raid_phy.first_valid_data == 1 then
                    pre_raid_phy.err_status[i] = {}
                    pre_raid_phy.err_cnt_info[i] = cur_raid_phy.err_cnt_info[j]
                    pre_raid_phy.invalid_cnt[i] = 0
                    break
                end

                -- Invalid data is dropped without comparison
                if not self:phy_biterr_check(pre_raid_phy.err_cnt_info[i],
                    cur_raid_phy.err_cnt_info[j], pre_raid_phy.invalid_cnt[i]) then
                    -- phy_biterr_check receives the count by value, so the
                    -- stored count has to be advanced here; it feeds the
                    -- interval calculation of the next valid comparison.
                    pre_raid_phy.invalid_cnt[i] = pre_raid_phy.invalid_cnt[i] + 1
                    break
                end

                self:phy_biterr_compare(pre_raid_phy.err_cnt_info[i], cur_raid_phy.err_cnt_info[j],
                    pre_raid_phy.failed_cnt_bak + pre_raid_phy.invalid_cnt[i], pre_raid_phy.err_status[i])

                -- Comparison finished normally: clear the invalid sample count of this PHY
                pre_raid_phy.invalid_cnt[i] = 0
                pre_raid_phy.err_cnt_info[i] = cur_raid_phy.err_cnt_info[j]
            end
        end
    end
end

-- Compare the error counters of one PHY and store the result in p_status.
-- p_last / p_current: previous and current counter records.
-- last_failed_cnt:    number of rounds without usable data since p_last.
-- p_status:           status record that receives err_cnt_changed,
--                     err_cnt_change_dir and increased_over_thresh.
-- Returns nothing; does nothing when p_last, last_failed_cnt or p_status is
-- missing.
function c_phy_biterr:phy_biterr_compare(p_last, p_current, last_failed_cnt, p_status)
    if not (p_last and last_failed_cnt and p_status) then
        return
    end

    local counter_names = {
        'invalid_dword_count',
        'loss_dword_sync_count',
        'phy_reset_problem_count',
        'running_disparity_error_count'
    }

    -- Differences of the four error counters
    local magnitude, decreased = {}, {}
    local any_changed, any_decreased = false, false
    for idx, name in ipairs(counter_names) do
        local diff = p_current[name] - p_last[name]
        magnitude[idx] = math.abs(diff)
        decreased[idx] = not (diff >= 0)
        if magnitude[idx] ~= 0 then
            any_changed = true
        end
        if decreased[idx] then
            any_decreased = true
        end
    end

    -- Result: did any counter change?
    p_status.err_cnt_changed = any_changed and 1 or 0

    -- Result: did any counter decrease?
    p_status.err_cnt_change_dir = any_decreased and 1 or 0

    -- Result: does the increase per interval exceed the SEL threshold?
    local intervals = last_failed_cnt + 1
    local threshold = self.config_obj.PhyErrorThreshold

    if intervals == 0 then
        p_status.increased_over_thresh = 0
        return
    end

    local over_threshold = false
    for idx = 1, #counter_names do
        if threshold < (magnitude[idx] // intervals) and not decreased[idx] then
            over_threshold = true
            break
        end
    end
    p_status.increased_over_thresh = over_threshold and 1 or 0
end

-- The RAID card error statistics obtained through smlib only accumulate and
-- never decrease, so a decreased value means the data of this round is
-- invalid (for example all zeros).
-- Returns true when the data of this round is valid, false when it is
-- invalid and has to be discarded.
-- p_invalid_cnt is a number and therefore a private copy; it only decides
-- whether the error is logged (first invalid sample in a row).
function c_phy_biterr:phy_biterr_check(p_last_err, p_current_err, p_invalid_cnt)
    local went_backwards =
        p_current_err.invalid_dword_count < p_last_err.invalid_dword_count or
        p_current_err.loss_dword_sync_count < p_last_err.loss_dword_sync_count or
        p_current_err.phy_reset_problem_count < p_last_err.phy_reset_problem_count or
        p_current_err.running_disparity_error_count < p_last_err.running_disparity_error_count

    if not went_backwards then
        return true
    end

    if p_invalid_cnt + 1 == 1 then
        log:error('invalid phyid=%s, last=%s,%s,%s,%s, current=%s,%s,%s,%s.', p_last_err.phy_id,
            p_last_err.invalid_dword_count, p_last_err.loss_dword_sync_count,
            p_last_err.phy_reset_problem_count, p_last_err.running_disparity_error_count,
            p_current_err.invalid_dword_count, p_current_err.loss_dword_sync_count,
            p_current_err.phy_reset_problem_count, p_current_err.running_disparity_error_count
        )
    end
    return false
end

-- Count the error records of one PHY id in a file.
-- A record is a non-empty line that starts with "<phy_id>,".
-- Returns the count, or nothing when the file cannot be opened or read.
function c_phy_biterr:phy_log_get_record_num(file_name, phy_id)
    -- Pattern matching lines that begin with the PHY id
    local line_pattern = "^" .. phy_id .. ','

    local file = file_sec.open_s(file_name, "r")
    if not file then
        return
    end

    local content = utils.close(file, pcall(file.read, file, "*all"))
    if not content then
        return
    end

    local matches = 0
    for line in string.gmatch(content, "[^\n]+") do
        if line:find(line_pattern) then
            matches = matches + 1
        end
    end
    return matches
end

-- Read the error information of one round from the mock file.
-- p_diag_info: diagnosis record to fill; its raid_mock_mgnt_info names the
--              file and tracks the record index.
-- On success raid_phy.phy_count and raid_phy.err_cnt_info are filled and the
-- record index advances. On any failure raid_phy.phy_count is left at / set
-- to 0 so that the sample is not treated as valid data.
function c_phy_biterr:phy_biterr_get_from_file(p_diag_info)
    if not self.config_obj or not self.config_obj.FileMockEnabled or
        not utils.realpath(p_diag_info.raid_mock_mgnt_info.file_name) then
        return
    end

    local mock_info = p_diag_info.raid_mock_mgnt_info
    local raid_phy = p_diag_info.raid_phy

    -- Number of error records (taken from PHY 0); the helper returns nothing
    -- when the file cannot be read
    local record_num = self:phy_log_get_record_num(mock_info.file_name, 0)
    if not record_num or record_num == 0 then
        return
    end

    -- Number of PHYs; the helper returns nothing when the file cannot be read
    local max_phy_id = self:phy_log_get_max_phy_id(mock_info.file_name)
    if not max_phy_id or max_phy_id == common_def.INVALID_U8 then
        return
    end

    local phy_count = max_phy_id + 1
    if phy_count > SML_MAX_SAS_PHY_PER_CTRL then
        phy_count = SML_MAX_SAS_PHY_PER_CTRL
    end
    raid_phy.phy_count = phy_count

    -- Wrap the record index around when it runs past the last record
    if mock_info.record_index >= record_num then
        mock_info.record_index = 0
    end

    -- Fetch the error data of every PHY
    for i = 0, phy_count - 1 do
        if not self:phy_log_get_specified_record(p_diag_info, i) then
            -- Reading the RAID card errors from the file failed: set the
            -- PHY count to 0
            raid_phy.phy_count = 0
            return
        end
    end

    mock_info.record_index = mock_info.record_index + 1
    mock_info.mock_first_time = false
end

-- Load the stored error record of one PHY id from the mock file.
-- p_diag_info: diagnosis record; the selected record (record_index of
--              raid_mock_mgnt_info) is written to raid_phy.err_cnt_info[phy_id].
-- phy_id:      PHY whose record is wanted.
-- Returns true on success and nothing on failure.
function c_phy_biterr:phy_log_get_specified_record(p_diag_info, phy_id)
    local file_name = p_diag_info.raid_mock_mgnt_info.file_name
    local record_idx = p_diag_info.raid_mock_mgnt_info.record_index
    local p_phy = p_diag_info.raid_phy.err_cnt_info[phy_id]

    if not file_name or not p_phy then
        return
    end

    -- Number of records of this PHY; nothing is returned when the file cannot
    -- be read. Record indices start at 0, so an index equal to the count has
    -- no record either.
    local record_num = self:phy_log_get_record_num(file_name, phy_id)
    if not record_num or record_idx >= record_num then
        return
    end

    local target_line = self:phy_log_buf_find_specified_record(phy_id, file_name, record_idx)
    if not target_line then
        return
    end

    -- Split the line at commas and keep the numeric fields only
    local data = {}
    for field in string.gmatch(target_line, "([^,]+)") do
        local value = tonumber(field)
        if value then
            data[#data + 1] = value
        end
    end

    -- A usable record carries the PHY id plus four counters; storing nil
    -- counters would break the later comparison arithmetic.
    if #data < 5 then
        log:error('invalid mock record for phy %s, record index is %s', phy_id, record_idx)
        return
    end

    log:notice('mock data is %s, %s, %s, %s, %s, record index is %s',
        data[1], data[2], data[3], data[4], data[5], record_idx)
    p_phy.phy_id = data[1]
    p_phy.invalid_dword_count = data[2]
    p_phy.loss_dword_sync_count = data[3]
    p_phy.phy_reset_problem_count = data[4]
    p_phy.running_disparity_error_count = data[5]

    return true
end

-- Find the record with a given zero-based index among the records of one
-- PHY id.
-- A record is a non-empty line that starts with "<phy_id>,".
-- Returns the whole line, or nil when the file cannot be read or the PHY has
-- fewer records.
function c_phy_biterr:phy_log_buf_find_specified_record(phy_id, file_name, record_idx)
    -- Pattern matching lines that begin with the PHY id
    local line_pattern = "^" .. phy_id .. ','

    local file = file_sec.open_s(file_name, "r")
    if not file then
        return
    end

    local content = utils.close(file, pcall(file.read, file, "*all"))
    if not content then
        return
    end

    local seen = 0
    for line in string.gmatch(content, "[^\n]+") do
        if line:find(line_pattern) then
            if seen == record_idx then
                return line
            end
            seen = seen + 1
        end
    end

    return nil
end

-- Determine the largest PHY id found in a file.
-- Only lines that start with "<digits>," are considered.
-- Returns the largest id, common_def.INVALID_U8 when no such line exists,
-- or nothing when the file cannot be opened or read.
function c_phy_biterr:phy_log_get_max_phy_id(file_name)
    local invalid_phy_id = common_def.INVALID_U8
    local max_id = -1

    local file = file_sec.open_s(file_name, "r")
    if not file then
        return
    end
    local content = utils.close(file, pcall(file.read, file, "*all"))

    if not content then
        return
    end
    for line in string.gmatch(content, "[^\n]+") do
        -- One anchored pattern both selects the line and extracts the id, so
        -- a line with "<digits>," elsewhere can no longer produce a nil id
        local digits = string.match(line, "^(%d+),")
        if digits then
            local num = tonumber(digits)
            if num and num > max_id then
                max_id = num
            end
        end
    end

    return (max_id == -1) and invalid_phy_id or max_id
end

-- Acquire the current PHY error data of a RAID card.
-- pre_info: persistent record of the controller.
-- cur_info: fresh record that receives the acquired data.
-- Returns a boolean: true when data acquisition was performed successfully
-- (always true in mock mode), false otherwise.
function c_phy_biterr:phy_biterr_get(pre_info, cur_info)
    -- Carry over the identification of the RAID card
    cur_info.obj_index = pre_info.obj_index
    cur_info.max_raid_phy_count = pre_info.max_raid_phy_count

    -- File-based mock input takes precedence when it is switched on
    if self.config_obj and self.config_obj.FileMockEnabled then
        if pre_info.raid_mock_mgnt_info.mock_first_time then
            pre_info.raid_mock_mgnt_info.record_index = 0
        end

        cur_info.raid_mock_mgnt_info = pre_info.raid_mock_mgnt_info
        self:phy_biterr_get_from_file(cur_info)

        -- Hand the mock state back to the persistent record
        pre_info.raid_mock_mgnt_info = cur_info.raid_mock_mgnt_info
        return true
    end

    if self.config:support_oob() then
        -- The OOB helper reports 0 on success and a non-zero number on
        -- failure. Every number is truthy in Lua, so the result is converted
        -- to a boolean for callers that test it with `not`.
        return self:phy_biterr_get_from_oob(cur_info) == 0
    end

    return false
end

-- Query the SAS PHY error counters of a controller through sml.
-- ctrl_phy_err: request/response table. Reads i_controller_index and
--               i_force_update; writes o_phy_count and o_phy[0 .. count-1].
-- Returns 0 on success and -1 when the query raised or produced no result.
function c_phy_biterr:get_ctrl_sas_phy_err_count(ctrl_phy_err)
    local ok, phy_arr = pcall(function ()
        return sml.get_ctrl_sas_phy_err(ctrl_phy_err.i_controller_index, ctrl_phy_err.i_force_update)
    end)

    -- A call that succeeds without a result is a failure as well
    if not ok or phy_arr == nil then
        return -1
    end

    if not ctrl_phy_err.o_phy then
        ctrl_phy_err.o_phy = {}
    end

    -- Entries are read from index 0 while `#` counts from index 1, hence the
    -- range 0 .. #phy_arr. Copying stops at the first missing entry so that
    -- an empty or sparse result cannot cause a nil access.
    local count = 0
    for i = 0, #phy_arr do
        local src = phy_arr[i]
        if src == nil then
            break
        end
        ctrl_phy_err.o_phy[i] = {
            phy_id = i,
            invalid_dword_count = src.invalid_dword_count,
            loss_dword_sync_count = src.loss_dword_sync_count,
            phy_reset_problem_count = src.phy_reset_problem_count,
            running_disparity_error_count = src.running_disparity_error_count
        }
        count = count + 1
    end
    ctrl_phy_err.o_phy_count = count

    return 0
end

-- Acquire the current PHY error data of a RAID card out of band.
-- diag_info: record that receives raid_phy.err_cnt_info and
--            raid_phy.phy_count; obj_index selects the controller.
-- Returns the numeric result of get_ctrl_sas_phy_err_count (0 on success).
function c_phy_biterr:phy_biterr_get_from_oob(diag_info)
    local ctrl_phy_err = {
        i_controller_index = diag_info.obj_index,
        i_force_update = 1
    }

    local ret = self:get_ctrl_sas_phy_err_count(ctrl_phy_err)
    if ret ~= 0 then
        -- The lookup is only needed for the log message; it is protected
        -- like every other controller lookup in this file so that a lookup
        -- error cannot abort the diagnosis round.
        local ok, controller = pcall(function ()
            return c_controller_service.get_instance():get_by_controller_id(diag_info.obj_index)
        end)
        if ok and controller then
            log:error("Get RAID Controller %s PHY error info failed, ctrl id = %d, return %s",
                controller.DeviceName, controller.Id, ret)
        end
        return ret
    end

    local phy_count = ctrl_phy_err.o_phy_count
    if phy_count > SML_MAX_SAS_PHY_PER_CTRL then
        phy_count = SML_MAX_SAS_PHY_PER_CTRL
    end

    -- The PHY count must not exceed the count in the RAID card configuration
    -- (the 3416IT reports 17 PHYs; the last one, PHY16, is virtual and has to
    -- be filtered out)
    if (diag_info.max_raid_phy_count ~= 0) and (phy_count > diag_info.max_raid_phy_count) then
        phy_count = diag_info.max_raid_phy_count
    end
    ctrl_phy_err.o_phy_count = phy_count

    for i = 0, phy_count - 1 do
        diag_info.raid_phy.err_cnt_info[i] = ctrl_phy_err.o_phy[i]
    end
    diag_info.raid_phy.phy_count = phy_count

    return ret
end

-- Check whether the RAID diagnosis interval has expired.
-- Returns true when a diagnosis round should start now, false otherwise.
function c_phy_biterr:check_phy_biterr_diagnose_startup()
    local last = self.phy_biterr_last_diag_time
    local current = math.floor(os.time())
    local interval = self.config_obj.PhyErrorInterval

    -- last == 0 means the diagnosis task runs for the first time
    if last == 0 then
        return true
    end

    local elapsed
    if current > last then
        elapsed = current - last
    elseif last > current then
        -- Current time is behind the stored one: compute the distance with a
        -- wrap-around at common_def.INVALID_U32
        elapsed = common_def.INVALID_U32 - last + current
    else
        return false
    end

    return elapsed > interval
end

-- Update the counter of failed PHY error acquisitions.
-- p_phyerr: diagnosis record of one RAID controller (nil is ignored).
function c_phy_biterr:phy_biterr_get_failed_cnt(p_phyerr)
    if not p_phyerr then
        return
    end

    local raid_phy = p_phyerr.raid_phy
    -- Failures are only counted once valid data has been obtained.
    -- base_data_obtained is a 0/1 number and 0 is truthy in Lua, so it has to
    -- be compared explicitly.
    if raid_phy.base_data_obtained == 1 then
        raid_phy.failed_cnt = raid_phy.failed_cnt + 1
    else
        raid_phy.failed_cnt = 0
    end
end


-- Switch on file-based mock input for one controller.
-- ctrl_id:     id of the controller whose diagnosis record is mocked.
-- expander_id: not used by this function.
-- file_path:   path of the mock input file.
-- Returns 0 on success and -1 when the controller has no diagnosis record.
function c_phy_biterr:mock(ctrl_id, expander_id, file_path)
    local target
    for _, entry in pairs(self.diagnose_list) do
        if entry.obj_index == ctrl_id then
            target = entry
        end
    end

    if target == nil then
        log:notice('controller %s not exis', ctrl_id)
        return -1
    end

    local mock_info = target.raid_mock_mgnt_info
    mock_info.mock_enabled = 1
    mock_info.record_index = 0
    mock_info.file_name = file_path
    mock_info.mock_first_time = true

    -- The switch itself lives in the storage configuration object
    c_storageconfig.get_instance().obj.FileMockEnabled = true
    return 0
end

-- Module result: the class wrapped by mc.singleton.
return singleton(c_phy_biterr)