-- Copyright (c) 2024 Huawei Technologies Co., Ltd.
-- openUBMC is licensed under Mulan PSL v2.
-- You can use this software according to the terms and conditions of the Mulan PSL v2.
-- You may obtain a copy of Mulan PSL v2 at:
--         http://license.coscl.org.cn/MulanPSL2
-- THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
-- EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-- MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-- See the Mulan PSL v2 for more details.

local signal = require 'mc.signal'
local utils = require 'mc.utils'
local log = require 'mc.logging'
local initiator = require 'mc.initiator'
local debounce = require 'mc.debounce.debounce'

local sml = require 'sml'
local c_object = require 'object_manager.object'
local c_tasks = require 'tasks'
local common_def = require 'common_def'
local ctrl_commu_loss_monitor = require 'ctrl_commu_loss_monitor'
local c_storageconfig = require 'storageconfig.storageconfig_object'
local json = require 'cjson'
local method_misc = require 'method_misc'
local skynet = require 'skynet'
local error_engine = require 'error_engine'
local c_drives_object = require 'drives.drives_object'
local add_event = require 'add_event'
local SML_ERR_CODE_E <const> = error_engine.SML_ERR_CODE_E
local smart_def = common_def.PD_SMART_DEF

-- SMART self-test type codes. Entries 0-2 carry the SATA_ prefix and
-- entries 100-102 carry the SAS_ prefix.
local SMART_TEST_TYPE = {
    SATA_OFFLINE = 0,
    SATA_SHORT_OFFLINE = 1,
    SATA_EXTENDED_OFFLINE = 2,
    SAS_DEFAULT = 100,
    SAS_BACKGROUND_SHORT = 101,
    SAS_BACKGROUND_LONG = 102
}

-- Task name constants and timing intervals used by the drive object.
local TASK_PRESENCE <const> = 'update_presence'
local TASK_UPDATE <const> = 'update'
local TASK_NVME <const> = 'update_nvme'
local TASK_UPDATE_SMART <const> = 'update_smart'
local TASK_UPDATE_STATUS <const> = 'update_status' -- this task mainly monitors the status reported by the CPLD
local TASK_DUMP_LOG <const> = 'dump_log'
local TASK_DUMP_LOG_PRECHECK = 'dump_log_precheck'
local TASK_DUMP_LOG_INTERVAL = 30 * 60 * 1000 -- interval for checking the log-collection trigger condition: 30 minutes
local TASK_DUMP_LOG_PRECHECK_INTERVAL = 5 * 60 * 1000 -- interval for pre-checking the log-collection trigger condition: 5 minutes
local UPDATE_INTERVAL_OS_24_HOURS = (3600 * 24) -- the spare-block record is refreshed once every 24 hours

---@class c_drive: c_object
---@field Id integer
---@field SlotNumber integer
---@field Name string
---@field Presence integer
---@field EnclosureId integer
---@field LocationIndicatorState integer
---@field CapacityMiB integer
---@field TemperatureCelsius integer
---@field Model string
---@field ObjectName string
---@field LocateLed integer
---@field FaultLed integer
---@field NodeId string
---@field PhysicalLocation string
---@field RelativeSlot integer
---@field HddBackplaneStartSlot integer
---@field RefControllerId integer
---@field RefVolumeList integer[]
---@field RefDiskArrayId integer
---@field on_presence_changed c_basic_signal
---@field on_identified_changed c_basic_signal
---@field on_pd_update c_basic_signal
---@field dump_interval integer
---@field on_dump_log c_basic_signal
---@field on_update_diagnose_info c_basic_signal
---@field on_update_io_diagnose_info c_basic_signal
---@field new fun(...): c_drive
local c_drive = c_object('Drive', { 'ObjectName' }) -- Drive class; its primary key is the ObjectName field

-- Template of the per-drive diagnose information. The constructor copies it
-- into self.diagnose_info with utils.table_copy, so each drive gets its own table.
local basic_diagnose_info = {
    smart_item = '',
    error_log_count = 0,
    error_log = '',
    selftest_failure_count = 0,
    selftest_log = '',
    phy_event_icrc_count = 0,
    glist_count = 0,
    glist = '',
    plist_count = 0,
    plist = '',
    critical_event_count = 0,
    critical_event_log = '',
    error_count_log = '',
    temperature_log = '',
    informational_exceptions_log = '',
    background_medium_scan_count = 0,
    background_medium_scan_log = '',
    protocol_specific_port_log = '',
    solid_state_media_log = '',
    general_statistics_log = '',
    vendor_specific_log = '',
    asc = '',
    ascq = ''
}

-- Template of the per-drive IO diagnose information. The constructor copies it
-- into self.io_diagnose_info with utils.table_copy.
local basic_io_diagnose_info = {
    drive_name = '',
    media_type = '',
    manufacturer = '',
    serial_number = '',
    protocol = common_def.INVALID_U8,
    ref_controller_id = common_def.INVALID_U8,
    glist_cnt = 0,
    plist_cnt = 0,
    unc_static = 0,
    media_error = 0,
    other_error = 0,
    cmd_timeout = 0,
    bit_error_rate_zone = '',
    flyheight_clearance_delta_outer = '',
    flyheight_clearance_delta_inner = '',
    flyheight_clearance_delta_middle = '',
    cur_farm = '',
    factory_farm = ''
}

-- Template of the estimated-remaining-lifespan record. The constructor copies it
-- into self.estimated_remaining_lifespan_info with utils.table_copy; numeric fields
-- start at the INVALID_* sentinels from common_def.
local basic_estimated_remaining_lifespan_info = {
    drive_name = '',
    media_type = '',
    manufacturer = '',
    serial_number = '',
    protocol = common_def.INVALID_U8,
    ref_controller_id = common_def.INVALID_U8,
    is_support_hw_defined = 0,
    slc_avg_ec = common_def.INVALID_U32,
    tlc_avg_ec = common_def.INVALID_U32,
    slc_poh = common_def.INVALID_U32,
    tlc_poh = common_def.INVALID_U32,
    slc_pe_cycle = common_def.INVALID_U32,
    tlc_pe_cycle = common_def.INVALID_U32,
    remn_wearout = common_def.INVALID_U8,
    power_on_hours = common_def.INVALID_U32,
    tlc_used_lifespan = common_def.INVALID_NUM2STR,
    slc_used_lifespan = common_def.INVALID_NUM2STR
}

-- Template of the write-amplification record. The constructor copies it into
-- self.write_amplification_info with utils.table_copy; counters start at
-- common_def.STORAGE_INFO_INVALID_DWORD.
local write_amplification_info = {
    update_support_flag = 0,
    hw_defined_valid_flag = 0,
    hw_defined_nand_write_l = common_def.STORAGE_INFO_INVALID_DWORD,
    hw_defined_nand_write_h = common_def.STORAGE_INFO_INVALID_DWORD,
    hw_defined_host_write_l = common_def.STORAGE_INFO_INVALID_DWORD,
    hw_defined_host_write_h = common_def.STORAGE_INFO_INVALID_DWORD,
    vendor_valid_flag = 0,
    vendor_nand_write = common_def.STORAGE_INFO_INVALID_DWORD,
    vendor_host_write = common_def.STORAGE_INFO_INVALID_DWORD,
    last_nand_written = common_def.STORAGE_INFO_INVALID_DWORD,
    last_host_written = common_def.STORAGE_INFO_INVALID_DWORD,
    last_serial_number = 'N/A',
    first_start_flag = 1,
}

--- Constructor: initialises the drive's internal (non-property) state, creates its
--- signals and debouncer, and gives the instance private copies of the info templates.
function c_drive:ctor()
    self.device_id = 0xffff
    self.presence = 0 -- cached presence state
    self.last_presence = 255 -- previous presence state (255 = not yet known)
    self.on_presence_changed = signal.new() -- presence-changed signal
    self.on_identified_changed = signal.new() -- identification-state-changed signal
    self.on_pd_update = signal.new() -- pd update signal
    self.on_pd_update_smart = signal.new()
    self.on_update_volume_list = signal.new()
    self.on_dump_log = signal.new()
    self.on_update_diagnose_info = signal.new()
    self.on_update_diagnose_metric = signal.new()
    self.on_update_io_diagnose_info = signal.new()
    self.cont_bin = debounce[debounce.DEBOUNCED_CONT_BIN].new(5, 5, 0)
    self.identify_pd = false
    self.identify_fail_count = 0 -- number of failed LED-based identification attempts
    self.in_offline_array = false   -- whether the drive belongs to a logical drive in the offline state
    self.in_degraded_array = false   -- whether the drive belongs to a logical drive in the degraded state
    self.in_failed_array = false    -- whether the drive belongs to a logical drive in the failed state
    self.miss_by_oob = false -- missing detected through the out-of-band function
    self.miss_by_bma = false -- missing detected through bma
    self.fault_by_oob = false  -- fault diagnosed through the out-of-band function
    self.fault_by_cpld = false  -- fault detected through the cpld
    self.dump_info = {
        last_dump_time = 0, -- time of the previous log collection
        dump_interval = 24, -- default log collection interval is 24 hours
        first_start_time_in_hour = 255 -- by default collect on first start
    }
    self.RefControllerTypeId = common_def.INVALID_U8
    self.HealthCode = common_def.INVALID_U16
    -- Each instance gets its own copy of the module-level templates.
    self.diagnose_info = utils.table_copy(basic_diagnose_info)
    self.io_diagnose_info = utils.table_copy(basic_io_diagnose_info)
    self.estimated_remaining_lifespan_info = utils.table_copy(basic_estimated_remaining_lifespan_info)
    self.write_amplification_info = utils.table_copy(write_amplification_info)
    self.est_lifespan_last_up_time = 0
    self.spare_block_last_check_time = 0
    self.last_record_spare_block = 0
    self.is_protocol_valid = false
end

--- Destructor: stops the tasks owned by this drive object.
function c_drive:dtor()
    self:stop_tasks()
end

--- Recompute Id as HddBackplaneStartSlot + RelativeSlot and derive the
--- Name ('Disk<Id>') and NodeId ('HDDPlaneDisk<Id>') strings from it.
function c_drive:update_drive_slot_info()
    self.Id = self.HddBackplaneStartSlot + self.RelativeSlot
    local slot_id = self.Id
    self.Name = ('Disk%d'):format(slot_id)
    self.NodeId = ('HDDPlaneDisk%d'):format(slot_id)
end

--- Write an operation log entry for a plug-in / plug-out event and remember the
--- current Presence as last_presence. Nothing is logged while last_presence is
--- still 255 or when Presence is neither 0 nor 1.
function c_drive:log_presence()
    local current = self.Presence
    local is_plug_state = current == 0 or current == 1
    if is_plug_state and self.last_presence ~= 255 then
        local direction = current == 0 and 'out' or 'in'
        log:operation(initiator.new('NA', 'NA', '127.0.0.1'), 'storage',
            '%s plug %s successfully', self.Name, direction)
    end
    self.last_presence = self.Presence
end

--- Set FirmwareStatusError to `fault_assert`, logging a notice only when the value changes.
---@param fault_assert boolean new fault state
function c_drive:generate_fw_status_error(fault_assert)
    if self.FirmwareStatusError == fault_assert then
        return
    end

    local led_text = fault_assert == true and 'ON' or 'OFF'
    log:notice('Disk%s FirmwareStatusError will change to %s, FaultLed %s',
        self.Id, fault_assert, led_text)
    self.FirmwareStatusError = fault_assert
end

--- Start the hardware status task (timeout set to 1000 ms) that watches the fault LED.
--- While the storage config has not loaded all controllers, FirmwareStatusError is
--- forced to false. Afterwards the debounced FaultLed value (only sampled when the
--- drive is present and the locate LED is off) is reported either as a firmware
--- status error (OOB supported) or as a 'CPLD' fault.
function c_drive:monitor_fault_state()
    local status_task = self:new_task({ TASK_UPDATE_STATUS, self.ObjectName })
    status_task:loop(function(_)
        if not c_storageconfig.get_instance().all_ctrl_loaded then
            self.FirmwareStatusError = false
            return
        end

        local debounced = 0
        if self.Presence == 1 and self.LocateLed == 0 then
            debounced = self.cont_bin:get_debounced_val(self.FaultLed)
        end

        local asserted = debounced == 1
        if c_storageconfig.get_instance():support_oob() then
            self:generate_fw_status_error(asserted)
        else
            self:generate_fault(asserted, 'CPLD')
        end
    end):set_timeout_ms(1000)
end

--- Update the rebuild state and rebuild progress from a pd info record.
--- When fw_state equals common_def.FW_SYSTEM or common_def.PD_STATE.EPD both values
--- are reset to 0; otherwise they are taken from pd_info.proginfo, which must be
--- non-nil here.
function c_drive:update_rebuild_state_and_progress(pd_info)
    local fw_state = pd_info.fw_state
    local no_rebuild_info = fw_state == common_def.FW_SYSTEM or
        fw_state == common_def.PD_STATE.EPD
    if no_rebuild_info then
        self.RebuildState = 0
        self.RebuildProgress = 0
        return
    end

    local progress = pd_info.proginfo

    -- Rebuild state: only written when it actually differs.
    local new_state = progress.rebuild_state
    if self.RebuildState ~= new_state then
        self.RebuildState = new_state
    end

    -- Rebuild progress: always written.
    self.RebuildProgress = progress.rebuild_progress
end

--- Subscribe the diagnose-info handlers: on_update_diagnose_info forwards to
--- update_diagnose_info and on_update_io_diagnose_info to update_io_diagnose_info.
function c_drive:diagnose_info_update_proc()
    local function handle_diagnose(payload)
        self:update_diagnose_info(payload)
    end
    local function handle_io_diagnose(payload)
        self:update_io_diagnose_info(payload)
    end

    self.on_update_diagnose_info:on(handle_diagnose)
    self.on_update_io_diagnose_info:on(handle_io_diagnose)
end

--- Initialise the drive object: reset properties to defaults, register the property
--- and signal listeners, and start the presence and fault-state tasks.
function c_drive:init()
    -- Update the drive's locate indicator state
    self:set_default_values()
    self.RefVolumeList = {}
    self.Location = self.PhysicalLocation
    self:update_location_indicator_state()
    self.on_property_changed:on(function(key, _)
        if key == 'LocateLed' or key == 'FaultLed' then
            self:update_location_indicator_state()
        end
    end)

    -- Complements the Presence listener registered below: if Presence already changed
    -- before the listener exists, last_presence has to be brought up to date here
    if self.Presence ~= self.last_presence then
        self.last_presence = self.Presence
    end

    -- Record drive plug-in / plug-out logs
    self.on_property_changed:on(function(key, _)
        if key == 'Presence' then
            self:log_presence()
        end
    end)

    -- Update tasks are started only after the drive has been identified
    self.on_identified_changed:on(function(is_identified)
        if is_identified then
            self.Missing = 0
            self:start_update_tasks()
        else
            self:stop_update_tasks()
        end
    end)

    -- Start the presence update task
    self:new_task({ TASK_PRESENCE, self.ObjectName }):loop(function()
        if self.base_obj.HddBackplaneStartSlot and
           self.HddBackplaneStartSlot ~= common_def.INVALID_U8 then
            self:update_drive_slot_info()
        end
        self:update_presence()
    end)

    self:monitor_fault_state()

    -- A pd update refreshes static info, RAID info, failed-array membership,
    -- asset data and sub-health info, in that order
    self.on_pd_update:on(function(pd_info)
        self:update_static_drive_info(pd_info)
        self:update_raid_drive_info(pd_info)
        self:check_in_failed_array()
        self:update_asset_data_info()
        self:update_subhealth_info(pd_info)
    end)

    self.on_pd_update_smart:on(function(pd_info)
        self:update_drive_sas_smart_info(pd_info)
    end)

    self.on_dump_log:on(function()
        self:dump_drive_log()
        self:record_write_amp_log()
    end)

    self:diagnose_info_update_proc()
end

--- Copy the static drive properties returned by obj:update_drive_info() onto this drive.
--- Does nothing when `obj` is nil/false or when no info is returned.
---@param obj table|nil object exposing update_drive_info()
function c_drive:update_drive_static_info_by_sepecific_obj(obj)
    local static_info = obj and obj:update_drive_info()
    if not static_info then
        return
    end

    self.Model = static_info.Model
    self.SerialNumber = static_info.SerialNumber
    self.Manufacturer = static_info.Manufacturer
    self.ManufacturerId = static_info.ManufacturerId
    self.CapableSpeedGbs = static_info.CapableSpeedGbs
    self.NegotiatedSpeedGbs = static_info.NegotiatedSpeedGbs
    self.CapacityMiB = static_info.CapacityMiB
    self.MediaType = static_info.MediaType
    self.Protocol = static_info.Protocol
    -- NOTE(review): the source key is spelled 'ResouceId'; confirm it matches the
    -- field name produced by update_drive_info() before renaming anything.
    self.ResourceId = static_info.ResouceId
    self.Status = static_info.Status
    self.LifeUsedPercentage = static_info.LifeUsedPercentage
end

--- Update the NVMe SMART-derived properties from obj:get_nvme_smart_info().
--- Does nothing when `obj` is nil/false or when no info is returned.
--- PredictedMediaLifeLeftPercent is 100 - percent_used, clamped to 0 when
--- percent_used exceeds 100.
---@param obj table|nil object exposing get_nvme_smart_info()
function c_drive:update_nvme_smart_log_info(obj)
    local smart = obj and obj:get_nvme_smart_info()
    if not smart then
        return
    end

    local spare = smart.avail_spare
    self.AvailableSpare = spare
    self.TLCSpareBlockPercentage = spare
    self.CriticalWarning = smart.critical_warning

    local used = smart.percent_used
    self.UsedPercentage = used
    if used > 100 then
        self.PredictedMediaLifeLeftPercent = 0
    else
        self.PredictedMediaLifeLeftPercent = 100 - used
    end
    self.PowerOnHours = smart.power_on_hours
end

--- Fill the estimated-remaining-lifespan record from the drive's own properties
--- (used on the NVMe update path, where the media type is fixed to
--- common_def.MEDIA_TYPE_STR[1]).
function c_drive:update_subhealth_info_from_mctp()
    local lifespan = self.estimated_remaining_lifespan_info
    lifespan.drive_name = self.Name
    lifespan.media_type = common_def.MEDIA_TYPE_STR[1] -- NVMe is always treated as SSD
    lifespan.manufacturer = self.Manufacturer
    lifespan.serial_number = self.SerialNumber
    lifespan.protocol = self.Protocol
    lifespan.remn_wearout = self.PredictedMediaLifeLeftPercent
    lifespan.power_on_hours = self.PowerOnHours
end

--- Store the link-fault flag on the drive and persist it in the Drive table of the
--- storage config database: the row with a matching Id is updated, or a new row is
--- inserted when none exists.
---@param link_fault any new LinkFault value
function c_drive:set_link_fault(link_fault)
    self.LinkFault = link_fault

    local database = c_storageconfig.get_instance().db
    local row = database:select(database.Drive):where(database.Drive.Id:eq(self.Id)):first()
    local row_exists = row and row.Id
    if not row_exists then
        database:insert(database.Drive):value({
            Id = self.Id,
            LinkFault = self.LinkFault
        }):exec()
        log:notice('insert Drive %s LinkFault %s', self.Id, self.LinkFault)
        return
    end

    row.LinkFault = self.LinkFault
    row:save()
    log:notice('update Drive %s LinkFault to %s', self.Id, self.LinkFault)
end

--- Refresh the drive's dynamic properties from obj:update_dynamic_drive_info().
--- Does nothing when `obj` is nil/false or when no info is returned.
--- Outside manufacture mode the fault LED is driven to follow the Failure flag, and a
--- change of the link-fault flag raises an NVMe link-fault event and is persisted.
---@param obj table|nil object exposing update_dynamic_drive_info()
function c_drive:update_dynamic_info(obj)
    if not obj then
        return
    end

    local info = obj:update_dynamic_drive_info()
    if not info then
        return
    end

    self.TemperatureCelsius = info.TemperatureCelsius
    self.PredictiveFailure = info.PredictiveFailure
    self.PredictedMediaLifeLeftPercent = info.PredictedMediaLifeLeftPercent
    -- Failure is a boolean: true only when the reported value is exactly 1.
    self.Failure = info.Failure == 1
    self.PowerOnHours = info.PowerOnHours
    self.MediaErrorCount = info.MediaErrorCount
    self.Revision = info.Revision

    -- Manufacture mode is detected by whether 'storage_manufacture_app' can be required;
    -- the fault LED is only synchronised when it cannot.
    local in_manufacture_mode = pcall(require, 'storage_manufacture_app')
    if not in_manufacture_mode then
        log:info('[storage]check manufacture mode is false.')
        local wanted_led = self.Failure and 1 or 0
        if wanted_led ~= self.FaultLed then
            self:set_fault_led_by_smc(wanted_led)
        end
    end

    if self.LinkFault ~= info.link_fault then
        add_event.get_instance().generate_nvme_link_fault(info.link_fault and 'true' or 'false', self.Name)
        self:set_link_fault(info.link_fault)
    end
    self.Status = info.Status
    self.LifeUsedPercentage = info.LifeUsedPercentage
end

--- Bind or unbind an NVMe object.
--- With an `nvme` object a task (timeout set to 10000 ms) is started that refreshes the
--- drive from it, skipping every round in which the drive is identified by a pd.
--- Without one the serial number is invalidated and the drive is stopped.
---@param nvme table|nil NVMe object, or nil to unbind
function c_drive:nvme_identified(nvme)
    if not nvme then
        self.SerialNumber = common_def.INVALID_STRING
        self:stop()
        return
    end

    log:notice('Start update NVME info,ObjectName:%s identify_pd:%s.', self.ObjectName, self.identify_pd)

    local function refresh_from_nvme()
        -- When a RAID card provides the drive, prefer the information from the RAID card
        if self.identify_pd then
            return
        end

        self:update_drive_static_info_by_sepecific_obj(nvme)
        self:update_dynamic_info(nvme)
        self:update_asset_data_info()
        nvme:init_nvme_mi_mctp()
        self:update_nvme_smart_log_info(nvme)
        self:update_subhealth_info_from_mctp()
    end

    self:new_task({ TASK_NVME, self.ObjectName }):loop(refresh_from_nvme):set_timeout_ms(10000)
end

--- Stop the drive: stop its tasks, reset its properties to defaults and
--- report it as not present.
function c_drive:stop()
    self:stop_tasks()
    self:set_default_values()

    -- After stopping, a not-present signal has to be emitted once more
    self:set_presence(0)
end

--- Whether the cached presence state (self.presence, maintained by set_presence) is non-zero.
---@return boolean
function c_drive:is_presence()
    return self.presence ~= 0
end

--- Reset every asset-inventory property of the drive to the empty string.
function c_drive:clear_asset_data_info()
    -- Fields are cleared in this fixed order.
    local asset_fields = {
        'AssetType',
        'AssetName',
        'InventorySerialNumber',
        'InventoryFirmwareVersion',
        'PCBVersion',
        'InventoryManufacturer',
        'AssetTag',
        'PartNumber',
        'ManufactureDate',
        'Slot',
    }
    for _, field in ipairs(asset_fields) do
        self[field] = ""
    end
end

-- Reset the drive's properties to their initial values
--- Most numeric properties are set to the INVALID_* sentinels from common_def; the asset
--- inventory is cleared through clear_asset_data_info().
function c_drive:set_default_values()
    -- Data that is persisted across a reset is deliberately not given a default here.
    self.TemperatureCelsius = common_def.INVALID_U8
    self.FirmwareStatus = common_def.INVALID_U8
    self.PowerOnHours = common_def.INVALID_U32
    self.RotationSpeedRPM = common_def.INVALID_U16
    self.BlockSizeBytes = common_def.INVALID_U16
    self.SASAddress1, self.SASAddress2 = common_def.INVALID_STRING, common_def.INVALID_STRING
    self.PredictedMediaLifeLeftPercent = common_def.INVALID_U8
    self.PredictedFailCount = common_def.INVALID_U32
    self.MediaErrorCount = common_def.INVALID_U32
    self.OtherErrorCount = common_def.INVALID_U32
    self.PatrolState = common_def.INVALID_U8

    self.RebuildProgress = common_def.INVALID_U8
    self.PowerState = common_def.INVALID_U8
    self.BootPriority = common_def.INVALID_U8
    self.HotspareType = common_def.INVALID_U8
    self.RefDiskArrayId = common_def.INVALID_U16
    self.Missing = 0
    self.PredictiveFailure = 0
    self.LastPrefailEventSeqNum = common_def.INVALID_U32
    self.StripTemperatureCelsius = common_def.INVALID_U32
    self.ElementsInGrownDefectList = common_def.INVALID_U32
    self.ElementsInPrimaryDefectList = common_def.INVALID_U32
    self.ManufacturedInWeekOfYear = common_def.INVALID_STRING
    self.BlocksSentToInitiator = common_def.INVALID_U32
    self.BlocksReceivedFromInitiator = common_def.INVALID_U32
    self.UntilNextInterSMARTTestMinutes = common_def.INVALID_U32
    self.Failure = false
    self.AvailableSpare = common_def.INVALID_U8
    self.UsedPercentage = common_def.INVALID_U8
    self.CriticalWarning = common_def.INVALID_U8
    self.is_protocol_valid = false
    self:clear_asset_data_info()
    self.Status = common_def.INVALID_U8
    self.LifeUsedPercentage = common_def.INVALID_U8
end

-- Whether the drive is currently being located: locate LED on and fault LED off
---@return boolean
function c_drive:is_being_located()
    return self.LocateLed == 1 and self.FaultLed == 0
end

-- Update the drive's locate indicator state:
-- 1 while the drive is being located, 0 otherwise
function c_drive:update_location_indicator_state()
    self.LocationIndicatorState = self:is_being_located() and 1 or 0
end

-- Whether the drive has been identified, i.e. identify_pd holds something other than false
---@return boolean
function c_drive:is_identified()
    return self.identify_pd ~= false
end

-- Bind the drive to a pd; a nil pd means the binding is released
--- On bind the controller id, slot number, enclosure id and device id are taken from
--- the pd, the pd gets a back reference (pd.ref_drive) and on_identified_changed emits
--- true. On release device_id is reset to 0xffff and the signal emits false.
---@param pd table|nil
function c_drive:identified(pd)
    if pd then
        self.RefControllerId = pd.controller_id
        self.SlotNumber = pd.slot_num
        self.EnclosureId = pd.enclosure_id
        self.device_id = pd.device_id

        self.identify_pd = pd
        pd.ref_drive = self
        log:notice('drive identify, pd = %s, drive = %s', pd.key, self.Name)
        self.on_identified_changed:emit(true)
        return
    end

    -- Release path: only log and clear when a pd was actually bound.
    if self:is_identified() then
        log:notice('drive unidentify, pd = %s, drive = %s', self.identify_pd.key, self.Name)
        self.identify_pd = false
    end

    self.device_id = 0xffff
    self.on_identified_changed:emit(false)
end

-- Update the presence state.
-- It is uncertain whether self-discovery correctly re-sends presence change
-- notifications on restart or hot-plug, so the presence state is cached here and the
-- presence-changed signal is emitted by this object itself.
---@param presence integer new presence value; the signal carries `presence ~= 0`
function c_drive:set_presence(presence)
    if self.presence == presence then
        return
    end

    self.presence = presence
    local is_present = presence ~= 0
    self.on_presence_changed:emit(is_present)
end

-- Periodic presence update: push the Presence property into the cached state and
-- reset the identification failure counter while the drive is absent
function c_drive:update_presence()
    self:set_presence(self.Presence)

    local absent = self.Presence == 0
    if absent then
        self.identify_fail_count = 0
    end
end

-- Collect the asset inventory of the disk
--- Serial number, firmware version and manufacturer mirror the drive's own properties;
--- fields with no source are reported as 'N/A'.
function c_drive:update_asset_data_info()
    self.AssetType = 'Disk'
    self.AssetName = self.Name
    self.InventorySerialNumber = self.SerialNumber
    self.InventoryFirmwareVersion = self.Revision
    self.PCBVersion = 'N/A'
    self.InventoryManufacturer = self.Manufacturer
    self.AssetTag = 'N/A'
    self.PartNumber = 'N/A'
    self.ManufactureDate = 'N/A'
    self.Slot = tostring(self.Id)
end

--- Whether `manufacturer` equals one of the three vendor names
--- PD_VENDOR_NAME_INTEL, PD_VENDOR_NAME_HUAWEI or PD_VENDOR_NAME_SAMSUNG.
---@param manufacturer string
---@return boolean
local function check_manufacture_statisfied_top3(manufacturer)
    return manufacturer == smart_def.PD_VENDOR_NAME_INTEL
        or manufacturer == smart_def.PD_VENDOR_NAME_HUAWEI
        or manufacturer == smart_def.PD_VENDOR_NAME_SAMSUNG
end

-- Collect the sub-health information of the disk
--- Fills the estimated-remaining-lifespan record from `pd_info`. Sources are applied in
--- order, later ones overriding earlier ones: hw-defined lifespan data, vendor lifespan
--- data, and finally the drive's own properties for SAS SSDs from the three vendors
--- accepted by check_manufacture_statisfied_top3.
function c_drive:update_subhealth_info(pd_info)
    local lifespan = self.estimated_remaining_lifespan_info
    lifespan.drive_name = self.Name
    lifespan.media_type = common_def.MEDIA_TYPE_STR[pd_info.media_type]
    lifespan.manufacturer = pd_info.manufacturer
    lifespan.serial_number = pd_info.serial_num
    lifespan.protocol = pd_info.interface_type
    lifespan.ref_controller_id = self.RefControllerId

    local hw_defined = pd_info.hw_defined_estimated_lifespan
    if hw_defined and hw_defined.hw_valid_flag ~= 0 then
        lifespan.is_support_hw_defined = hw_defined.hw_valid_flag
        -- These keys have the same name on both sides and are copied verbatim.
        local copied_keys = {
            'slc_avg_ec', 'tlc_avg_ec',
            'slc_poh', 'tlc_poh',
            'slc_pe_cycle', 'tlc_pe_cycle',
            'slc_used_lifespan', 'tlc_used_lifespan',
        }
        for _, key in ipairs(copied_keys) do
            lifespan[key] = hw_defined[key]
        end
    end

    local vendor = pd_info.vendor_estimated_lifespan
    if vendor and vendor.vendor_valid_flag ~= 0 then
        lifespan.is_support_hw_defined = 0
        lifespan.remn_wearout = vendor.remn_wearout
        lifespan.power_on_hours = vendor.power_on_hours
    end

    local is_sas_ssd = self.Protocol == common_def.DRIVE_PROTOCOL.SAS and
        common_def.MEDIA_TYPE_STR[self.MediaType] == 'SSD'
    if is_sas_ssd and check_manufacture_statisfied_top3(self.Manufacturer) then
        lifespan.remn_wearout = self.PredictedMediaLifeLeftPercent
        lifespan.power_on_hours = self.PowerOnHours
    end
end

-- Update the static information of the drive object
---@param pd_info c_pd_info
function c_drive:update_static_drive_info(pd_info)
    self.Model = pd_info.model

    -- Vendor of the physical drive
    self.Manufacturer = pd_info.manufacturer

    -- Firmware version of the physical drive
    self.Revision = pd_info.firmware_version

    -- Interface type of the physical drive
    self.Protocol = pd_info.interface_type

    -- Media type of the physical drive
    self.MediaType = pd_info.media_type

    -- Serial number of the physical drive
    self.SerialNumber = pd_info.serial_num

    -- Marks the protocol as valid
    self.is_protocol_valid = true
end

--- Record a new firmware state. Nothing happens when the state is unchanged; otherwise
--- an abnormal drive gets diagnose_task() run in a forked skynet coroutine, and a
--- normal one has its 'OOB' fault cleared.
---@param fw_state integer
function c_drive:update_raid_drive_firmware_status(fw_state)
    if self.FirmwareStatus ~= fw_state then
        self.FirmwareStatus = fw_state
        if not self:is_abnormal() then
            self:generate_fault(false, 'OOB')
        else
            skynet.fork(function()
                self:diagnose_task()
            end)
        end
    end
end

--- Decide whether more than UPDATE_INTERVAL_OS_24_HOURS has passed since `last_time`.
--- A `last_time` of 0 always qualifies. When `cur_time` is smaller than `last_time`
--- the elapsed time is computed as common_def.INVALID_U32 - last_time + cur_time.
---@param last_time integer timestamp of the previous record (0 = never)
---@param cur_time integer current timestamp
---@return boolean
local function record_log_check(last_time, cur_time)
    if last_time == 0 then
        return true
    end

    local elapsed
    if cur_time > last_time then
        elapsed = cur_time - last_time
    elseif last_time > cur_time then
        elapsed = common_def.INVALID_U32 - last_time + cur_time
    else
        -- Identical timestamps: nothing has elapsed.
        return false
    end
    return elapsed > UPDATE_INTERVAL_OS_24_HOURS
end

--- Write the maintenance log entry stating that the valid spare blocks of drive `name`
--- dropped below `threshold` percent. In the message `cur_tlc` is reported as the
--- "User Area" value and `cur_slc` as the "Non-User Area" value.
local function spare_black_to_maintaince_log(name, cur_tlc, cur_slc, threshold)
    log:maintenance(log.MLOG_INFO, common_def.FC__PUBLIC_OK,
        "The percent of valid spare blocks of %s(User Area: %s Non-User Area: %s) is lower than the threshold(%s%%).",
        name, cur_tlc, cur_slc, threshold)
end

--- Whether the SLC or TLC spare-block percentage has just dropped to or below
--- `threshold`: the drive's stored value is above it and the new value in `spareblock`
--- is at or below it, with both values valid. Returns false when this threshold is the
--- one most recently recorded.
---@param self c_drive
---@param threshold integer
---@param spareblock table carries slc_value and tlc_value
---@return boolean
local function check_percent_satisfied_thre(self, threshold, spareblock)
    -- Already recorded for this threshold; do not record again
    if self.last_record_spare_block == threshold then
        return false
    end

    local invalid = common_def.INVALID_U8
    local function dropped_through(previous, current)
        return previous ~= invalid and current ~= invalid and
            previous > threshold and current <= threshold
    end

    return dropped_through(self.SLCSpareBlockPercentage, spareblock.slc_value)
        or dropped_through(self.TLCSpareBlockPercentage, spareblock.tlc_value)
end

--- Write a maintenance log entry when the spare-block percentage crosses the 30%, 50%
--- or 70% threshold (checked in that order; the first match wins) and remember the
--- matched threshold in self.last_record_spare_block.
---@param self c_drive
---@param spareblock table carries slc_value and tlc_value
local function record_spare_black_to_maintaince_log(self, spareblock)
    local crossed = false
    for _, key in ipairs({ 'PERCENT_30', 'PERCENT_50', 'PERCENT_70' }) do
        local threshold = smart_def[key]
        if check_percent_satisfied_thre(self, threshold, spareblock) then
            self.last_record_spare_block = threshold
            crossed = true
            break
        end
    end

    if not crossed then
        return
    end

    -- Invalid readings are shown as the invalid-string placeholder in the log.
    local function printable(value)
        if value == common_def.INVALID_U8 then
            return common_def.INVALID_STRING
        end
        return value
    end
    local slc_text = printable(spareblock.slc_value)
    local tlc_text = printable(spareblock.tlc_value)
    -- Log when the spare-block ratio reaches 70% / 50% / 30%
    spare_black_to_maintaince_log(self.Name, tlc_text, slc_text, self.last_record_spare_block)
end

--- Record the spare-block maintenance log if the new readings warrant it.
---@param spareblock table new readings (slc_value, tlc_value)
---@param need_check boolean when truthy, the record is skipped unless record_log_check
---       accepts the time elapsed since the previous check
function c_drive:record_maintaince_log_for_spare_black(spareblock, need_check)
    local invalid = common_def.INVALID_U8

    -- At least one of the user-area / non-user-area readings must be valid
    local has_new_value = spareblock.slc_value ~= invalid or spareblock.tlc_value ~= invalid
    if not has_new_value then
        return
    end

    -- The values stored on the drive before this record are invalid: nothing to compare with
    local has_old_value = self.SLCSpareBlockPercentage ~= invalid or
        self.TLCSpareBlockPercentage ~= invalid
    if not has_old_value then
        log:info('get the Disk%s spare block info invalid', self.Id)
        return
    end

    -- Record once every 24 hours
    local now = math.floor(os.time())
    if need_check and not record_log_check(self.spare_block_last_check_time, now) then
        return
    end

    self.spare_block_last_check_time = now
    record_spare_black_to_maintaince_log(self, spareblock)
end

-- Update the drive object from the RAID-controller pd information
---@param pd_info c_pd_info
function c_drive:update_raid_drive_info(pd_info)
    self.TemperatureCelsius = pd_info.temperature
    self.CapacityMiB = pd_info.coerced_size

    -- Interface speed of the physical drive (maximum speed)
    self.CapableSpeedGbs = pd_info.device_speed

    -- Link speed of the physical drive (negotiated speed)
    self.NegotiatedSpeedGbs = pd_info.link_speed

    -- Firmware state of the physical drive
    self:update_raid_drive_firmware_status(pd_info.fw_state)

    -- Power-on time of the physical drive
    self.PowerOnHours = pd_info.power_on_hours

    -- Rotation speed of the physical drive
    self.RotationSpeedRPM = pd_info.rotation_speed

    -- Block size of the physical drive
    self.BlockSizeBytes = pd_info.block_size

    -- SAS addresses of the physical drive
    self.SASAddress1, self.SASAddress2 = pd_info.sas_addr1, pd_info.sas_addr2

    -- Remaining media wearout of the physical drive
    self.PredictedMediaLifeLeftPercent = pd_info.remnant_media_wearout

    -- Predictive failure count of the physical drive
    self.PredictedFailCount = pd_info.prefail_count

    -- Media error count of the physical drive
    self.MediaErrorCount = pd_info.media_err_count

    -- Other error count of the physical drive
    self.OtherErrorCount = pd_info.other_err_count

    local proginfo = pd_info.proginfo
    if proginfo then
        -- Patrol state of the physical drive
        self.PatrolState = proginfo.patrol_state
        self:update_rebuild_state_and_progress(pd_info)
    end

    -- Power state of the physical drive
    self.PowerState = pd_info.power_state

    -- Boot priority of the physical drive
    self.BootPriority = pd_info.boot_priority

    -- Hot-spare state of the physical drive
    self.HotspareType = pd_info.hot_spare

    -- Pre-failure (predictive failure) state of the physical drive
    self:update_prefail_state(pd_info)

    self.on_update_volume_list:emit(self)

    self.LastPrefailEventSeqNum = pd_info.last_prefail_event_seq_num

    local spareblock = pd_info.spareblock
    if spareblock then
        -- The log check runs before the stored percentages are overwritten, because it
        -- compares the new readings against the previously stored ones.
        self:record_maintaince_log_for_spare_black(spareblock, true)
        self.SLCSpareBlockPercentage = spareblock.slc_value
        self.TLCSpareBlockPercentage = spareblock.tlc_value
    end

    -- Update the write-amplification data
    self:update_write_amp_info(pd_info)
end

--- Set PredictiveFailure to 1 when the pd reports a valid non-zero prefail count or
--- halflife == 1, otherwise to 0.
function c_drive:update_prefail_state(pd_info)
    local prefail_count = pd_info.prefail_count
    local has_prefail = prefail_count ~= 0 and prefail_count ~= common_def.INVALID_U32
    if has_prefail or pd_info.halflife == 1 then
        self.PredictiveFailure = 1
    else
        self.PredictiveFailure = 0
    end
end

--- Copy the write-amplification data present in `pd_info` (writeamp,
--- hw_defined_write_amp, vendor_write_amp) into self.write_amplification_info.
--- Sections missing from `pd_info` leave the corresponding fields untouched.
function c_drive:update_write_amp_info(pd_info)
    local record = self.write_amplification_info

    local support = pd_info.writeamp
    if support then
        record.update_support_flag = support.update_support_flag
    end

    local hw_defined = pd_info.hw_defined_write_amp
    if hw_defined then
        record.hw_defined_valid_flag = hw_defined.valid_flag
        -- The four counters are stored under the same name with a 'hw_defined_' prefix.
        for _, key in ipairs({ 'nand_write_l', 'nand_write_h', 'host_write_l', 'host_write_h' }) do
            record['hw_defined_' .. key] = hw_defined[key]
        end
    end

    local vendor = pd_info.vendor_write_amp
    if vendor then
        record.vendor_valid_flag = vendor.valid_flag
        record.vendor_nand_write = vendor.nand_write
        record.vendor_host_write = vendor.host_write
    end
end

--- Read property `prop_name` from a decoded JSON object.
--- Table values are returned re-encoded as a JSON string, other non-nil values are
--- returned as they are. A missing property is logged as an error and replaced by a
--- default: 0 when `data_type` is 0, otherwise the empty string.
---@param json_obj table decoded JSON object
---@param prop_name string
---@param data_type integer 0 selects the numeric default, anything else the string default
local function phase_json_object(json_obj, prop_name, data_type)
    local value = json_obj[prop_name]
    if value == nil then
        log:error('prop %s is nil', prop_name)
        if data_type == 0 then
            return 0
        end
        return ''
    end

    if type(value) == 'table' then
        local encoded = json.encode(value)
        log:notice('prop %s is %s', prop_name, encoded)
        return encoded
    end

    log:notice('prop %s is %s', prop_name, value)
    return value
end

--- Extract the ASC / ASCQ codes from the 'informational_exceptions_log' array of a
--- decoded JSON object.
--- Each entry's 'asc' / 'ascq' string has '0x' removed, is parsed as hexadecimal and
--- converted to a decimal string; the values of the last entry visited win. Both
--- results are passed through json.encode before being returned.
--- When the object or the log is missing, not a table, or empty, two empty strings
--- are returned so that a caller taking both results never receives nil for ascq.
---@param json_obj table|nil decoded JSON object
---@return string asc
---@return string ascq
local function get_asc_and_ascq_from_ie_log(json_obj)
    if json_obj == nil then
        return '', ''
    end
    local ie_log = json_obj['informational_exceptions_log']
    -- type() also covers the nil case.
    if type(ie_log) ~= 'table' or #ie_log == 0 then
        return '', ''
    end
    local asc = ''
    local ascq = ''
    for _, asc_arg in pairs(ie_log) do
        -- gsub returns (string, count); the assignment keeps only the string.
        asc = string.gsub(asc_arg['asc'], '0x', '')
        asc = tostring(tonumber(asc, 16))
        ascq = string.gsub(asc_arg['ascq'], '0x', '')
        ascq = tostring(tonumber(ascq, 16))
    end
    local encoded_asc = json.encode(asc)
    local encoded_ascq = json.encode(ascq)
    log:notice('prop asc is %s', encoded_asc)
    log:notice('prop ascq is %s', encoded_ascq)
    return encoded_asc, encoded_ascq
end

-- Refresh self.diagnose_info.
-- The SMART info string is fetched through sml (a failure is only logged); all
-- other fields are copied from the JSON document passed in.
---@param diagnose_info string JSON text, decoded with json.decode
function c_drive:update_diagnose_info(diagnose_info)
    local fetched, smart_str = pcall(sml.pd_get_smart_info_str, self.RefControllerId, self.device_id, self.Protocol)
    if fetched then
        self.diagnose_info.smart_item = smart_str
    else
        log:error('Failed to get smart info, and return: %s', smart_str)
    end

    local decoded = json.decode(diagnose_info)
    local diag = self.diagnose_info
    diag.error_log_count = phase_json_object(decoded, 'error_log_count', 0)
    diag.error_log = phase_json_object(decoded, 'error_log', 1)

    -- the failure count is the length of the self-test log, 0 when it is absent
    diag.selftest_failure_count = decoded.selftest_log and #decoded.selftest_log or 0
    diag.selftest_log = phase_json_object(decoded, 'selftest_log', 1)
    diag.phy_event_icrc_count = phase_json_object(decoded, 'phy_event_icrc_count', 0)
    diag.glist_count = phase_json_object(decoded, 'glist_count', 0)
    diag.glist = phase_json_object(decoded, 'glist', 1)
    diag.plist_count = phase_json_object(decoded, 'plist_count', 0)
    diag.plist = phase_json_object(decoded, 'plist', 1)
    diag.critical_event_count = phase_json_object(decoded, 'critical_event_count', 0)
    diag.critical_event_log = phase_json_object(decoded, 'critical_event_log', 1)

    diag.error_count_log = phase_json_object(decoded, 'error_count_log', 1)
    diag.temperature_log = phase_json_object(decoded, 'temperature_log', 1)
    diag.informational_exceptions_log = phase_json_object(decoded, 'informational_exceptions_log', 1)
    diag.background_medium_scan_count = phase_json_object(decoded, 'background_medium_scan_count', 0)
    diag.background_medium_scan_log = phase_json_object(decoded, 'background_medium_scan_log', 1)
    diag.protocol_specific_port_log = phase_json_object(decoded, 'protocol_specific_port_log', 1)
    diag.asc, diag.ascq = get_asc_and_ascq_from_ie_log(decoded)
    diag.solid_state_media_log = phase_json_object(decoded, 'solid_state_media_log', 1)
    diag.general_statistics_log = phase_json_object(decoded, 'general_statistics_log', 1)
    diag.vendor_specific_log = phase_json_object(decoded, 'vendor_specific_log', 1)
end

-- Refresh self.io_diagnose_info from a JSON document plus a few properties of
-- the drive object itself (protocol, controller id, error counters).
---@param io_diagnose_info string JSON text, decoded with json.decode
function c_drive:update_io_diagnose_info(io_diagnose_info)
    -- an error counter equal to INVALID_U32 is reported as 0
    local function counter_or_zero(count)
        if count == common_def.INVALID_U32 then
            return 0
        end
        return count
    end

    local decoded = json.decode(io_diagnose_info)
    local io_diag = self.io_diagnose_info
    io_diag.drive_name = phase_json_object(decoded, 'drive_name', 1)
    io_diag.media_type = phase_json_object(decoded, 'media_type', 1)
    io_diag.manufacturer = phase_json_object(decoded, 'manufacturer', 1)
    io_diag.serial_number = phase_json_object(decoded, 'serial_number', 1)
    io_diag.protocol = self.Protocol or common_def.INVALID_U8
    io_diag.ref_controller_id = self.RefControllerId or common_def.INVALID_U8
    io_diag.glist_cnt = phase_json_object(decoded, 'glist_cnt', 0)
    io_diag.plist_cnt = phase_json_object(decoded, 'plist_cnt', 0)
    io_diag.unc_static = phase_json_object(decoded, 'unc_static', 0)
    io_diag.media_error = counter_or_zero(self.MediaErrorCount)
    io_diag.other_error = counter_or_zero(self.OtherErrorCount)
    io_diag.cmd_timeout = phase_json_object(decoded, 'cmd_timeout', 0)
    -- the JSON keys below are abbreviated; the target field names are the long forms
    io_diag.bit_error_rate_zone = phase_json_object(decoded, 'ber', 1)
    io_diag.flyheight_clearance_delta_outer = phase_json_object(decoded, 'fh_outer', 1)
    io_diag.flyheight_clearance_delta_inner = phase_json_object(decoded, 'fh_inner', 1)
    io_diag.flyheight_clearance_delta_middle = phase_json_object(decoded, 'fh_middle', 1)
    io_diag.cur_farm = phase_json_object(decoded, 'cur_mr', 1)
    io_diag.factory_farm = phase_json_object(decoded, 'fac_mr', 1)
end

-- Assert or deassert the "missing" alarm for one reporting source.
-- Each source ('OOB' or 'iBMA') keeps its own flag; self.Missing becomes 2 while
-- any flag is set and 0 otherwise. A maintenance log entry is written only when
-- the flag of the given source actually changes.
---@param assert boolean true to assert, false to deassert
---@param source string 'OOB' or 'iBMA'; other values only recompute self.Missing
function c_drive:generate_missing(assert, source)
    -- Missing == 1 means the link-abnormal alarm is active; in that state the
    -- missing alarm must be neither raised nor cleared
    if self.Missing == 1 then
        return
    end

    local flag_by_source = { OOB = 'miss_by_oob', iBMA = 'miss_by_bma' }
    local flag = flag_by_source[source]
    if flag and self[flag] ~= assert then
        self[flag] = assert
        log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_NO_IDENTIFY,
            "[%s] Physical drive %s is MISSING  - %s",
            source, self.Name, assert and 'Asserted' or 'Deasserted')
    end

    self.Missing = (self.miss_by_oob or self.miss_by_bma) and 2 or 0
end

-- Check whether a drive that is about to be removed has gone missing.
-- Sets self.Missing to 1 when the drive is present, its enclosure still appears
-- in pd_list, but its own enclosure/slot pair does not.
---@param pd_count_changed boolean nothing is checked unless the pd count changed
---@param pd_list table array of entries with enclosure_id and slot_num
function c_drive:check_pd_missing(pd_count_changed, pd_list)
    if not pd_count_changed or self.Presence ~= 1 then
        return
    end

    local slot_key_fmt = 'E%d:S%d'
    local listed_enclosures = {}
    local listed_slots = {}
    for _, pd in ipairs(pd_list) do
        listed_enclosures[pd.enclosure_id] = true
        listed_slots[string.format(slot_key_fmt, pd.enclosure_id, pd.slot_num)] = true
    end

    -- the slot key is only built once the enclosure itself is known to be listed
    if listed_enclosures[self.EnclosureId] and
        not listed_slots[string.format(slot_key_fmt, self.EnclosureId, self.SlotNumber)] then
        self.Missing = 1
    end
end

-- Track consecutive identification failures and drive the link-abnormal alarm.
-- A successful identification clears the alarm and resets the counter; the
-- alarm is raised on the call at which the counter reaches the threshold.
function c_drive:check_link_abnormal()
    local fail_threshold <const> = 60

    if self:is_identified() then
        self:generate_link_abnormal(false, 'OOB')
        self.identify_fail_count = 0
        return
    end

    local failures = self.identify_fail_count
    if failures >= fail_threshold then
        -- counter is saturated, the alarm was already handled
        return
    end

    failures = failures + 1
    self.identify_fail_count = failures
    if failures == fail_threshold then
        self:generate_link_abnormal(true, 'OOB')
    end
end

-- Assert or deassert the link-abnormal alarm (self.Missing 1 / 0).
-- Nothing happens while the missing alarm is active (self.Missing == 2) or when
-- the requested state is already the current one.
---@param assert boolean true to assert, false to deassert
---@param source string label written into the maintenance log
function c_drive:generate_link_abnormal(assert, source)
    local current = self.Missing
    -- Missing == 2 means the missing alarm is active; the link-abnormal alarm
    -- must be neither raised nor cleared in that state
    if current == 2 then
        return
    end

    local target = 0
    if assert then
        target = 1
    end
    if current == target then
        return
    end

    log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_NO_IDENTIFY,
        "[%s] Physical drive %s can not be identified by locating LED  - %s",
        source, self.Name, assert and 'Asserted' or 'Deasserted')

    self.Missing = target
end

-- Clear the OFFLINE and DEGRADED "in failed array" alarms when the drive no
-- longer meets the alarm condition.
-- Presence alone is sufficient; the firmware-state checks are kept for
-- consistency with V2 (per the original author's note).
function c_drive:check_in_failed_array()
    local clear_alarms = self.Presence == 1
    if not clear_alarms then
        local pd_state = common_def.PD_STATE
        local fw_status = self.FirmwareStatus
        clear_alarms = fw_status == pd_state.UNCONFIGURED_GOOD or
            fw_status == pd_state.ONLINE or
            fw_status == pd_state.READY or
            fw_status == pd_state.UNKNOWN
    end

    if clear_alarms then
        self:generate_in_failed_array(false, 'OOB', common_def.LD_STATE.OFFLINE)
        self:generate_in_failed_array(false, 'OOB', common_def.LD_STATE.DEGRADED)
    end
end

-- Assert or deassert the alarm saying that a logical drive containing this
-- physical drive is in the given state.
-- One flag is kept per state family (offline / degraded / failed);
-- self.InAFailedArray is 1 while any of them is set, otherwise 0. When the flag
-- already has the requested value nothing is logged or recomputed.
---@param assert boolean true to assert, false to deassert
---@param source string label written into the maintenance log
---@param volume_state integer one of the common_def.LD_STATE values mapped below
function c_drive:generate_in_failed_array(assert, source, volume_state)
    local ld_state = common_def.LD_STATE
    local descriptors = {
        [ld_state.OFFLINE] = { text = 'OFFLINE', flag = 'in_offline_array' },
        [ld_state.PARTIALLY_DEGRADED] = { text = 'DEGRADED or PARTIALLY DEGRADED', flag = 'in_degraded_array' },
        [ld_state.DEGRADED] = { text = 'DEGRADED or PARTIALLY DEGRADED', flag = 'in_degraded_array' },
        [ld_state.FAILED] = { text = 'FAILED', flag = 'in_failed_array' }
    }

    local descriptor = descriptors[volume_state]
    if self[descriptor.flag] == assert then
        return
    end
    self[descriptor.flag] = assert

    log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_LOGICAL_FAULT,
        "[%s] Logical drive which include Physical drive %s is %s - %s",
        source, self.Name, descriptor.text, assert and 'Asserted' or 'Deasserted')

    if self.in_offline_array or self.in_degraded_array or self.in_failed_array then
        self.InAFailedArray = 1
    else
        self.InAFailedArray = 0
    end
end

-- Copy the SAS drive SMART fields onto the drive object's properties.
---@param pd_sas_smart_info c_pd_sas_smart_info
function c_drive:update_drive_sas_smart_info(pd_sas_smart_info)
    -- { drive property, source field }, applied in this order
    local field_map = {
        { 'StripTemperatureCelsius', 'strip_temperature' },
        { 'ElementsInGrownDefectList', 'glist_len' },
        { 'ElementsInPrimaryDefectList', 'plist_len' },
        { 'ManufacturedInWeekOfYear', 'manufacture_data' },
        { 'BlocksSentToInitiator', 'blocks_sent' },
        { 'BlocksReceivedFromInitiator', 'blocks_received' },
        { 'UntilNextInterSMARTTestMinutes', 'minutes_left' }
    }
    for _, entry in ipairs(field_map) do
        self[entry[1]] = pd_sas_smart_info[entry[2]]
    end
end

-- Collect the drive log through sml and publish the result.
-- Does nothing while the protocol is unknown. On success the returned data is
-- emitted on on_update_diagnose_info; on_update_diagnose_metric is emitted in
-- either case.
function c_drive:dump_drive_log()
    local protocol = self.Protocol
    if not protocol or protocol == common_def.INVALID_U8 then
        return
    end

    local invalid_u8 = common_def.INVALID_U8
    local protocol_str = common_def.DRIVE_PROTOCOL_STR[protocol]
    log:notice('Start dump log ==> Name: %s, Protocol: %s, Manufacturer: %s',
        self.Name, protocol_str, self.Manufacturer)

    -- every field falls back to an "invalid"/'N/A' placeholder when unset
    local basic_info = {}
    basic_info.log_source = 0
    basic_info.log_rotate_num = 7
    basic_info.pd_power_state = self.PowerState or invalid_u8
    basic_info.pd_interface_type = protocol
    basic_info.pd_media_type = self.MediaType or invalid_u8
    basic_info.pd_slot_number = self.SlotNumber or invalid_u8
    basic_info.pd_remnant_media_wearout = self.PredictedMediaLifeLeftPercent or invalid_u8
    basic_info.pd_enclosure_id = self.EnclosureId or common_def.INVALID_U16
    basic_info.pd_device_name = self.Name or 'N/A'
    basic_info.pd_interface_type_str = protocol_str or 'N/A'
    basic_info.pd_manufacturer = self.Manufacturer or 'N/A'
    basic_info.pd_serial_number = self.SerialNumber or 'N/A'
    basic_info.pd_model = self.Model or 'N/A'

    local dumped, result = pcall(sml.pd_log_get_drive_log, self.RefControllerId or invalid_u8,
        self.Protocol or invalid_u8, self.device_id or invalid_u8, basic_info)
    if dumped then
        self.on_update_diagnose_info:emit(result)
    else
        log:error('Failed to dump log, and return: %s', result)
    end

    -- the protocol is re-read here because the sml call above may have taken a while
    log:notice('Dump log finished ==> Name: %s, Protocol: %s', self.Name, common_def.DRIVE_PROTOCOL_STR[self.Protocol])
    self.on_update_diagnose_metric:emit(self.ObjectName)
end

-- Record the estimated remaining lifespan and write-amplification log.
-- The cached write-amplification data is handed to sml; on success the returned
-- JSON supplies the new "last written" baselines.
function c_drive:record_write_amp_log()
    local amp_info = self.write_amplification_info
    -- a different serial number means a different drive: flag a first start
    if self.SerialNumber ~= amp_info.last_serial_number then
        amp_info.first_start_flag = 1
    end

    -- fields forwarded unchanged from the cached write-amplification data
    local forwarded = {
        'update_support_flag',
        'vendor_valid_flag', 'vendor_nand_write', 'vendor_host_write',
        'hw_defined_valid_flag',
        'hw_defined_nand_write_l', 'hw_defined_nand_write_h',
        'hw_defined_host_write_l', 'hw_defined_host_write_h',
        'last_nand_written', 'last_host_written',
        'first_start_flag'
    }
    local basic_info = {}
    for _, key in ipairs(forwarded) do
        basic_info[key] = amp_info[key]
    end
    basic_info.pd_device_name = self.Name or 'N/A'
    basic_info.pd_serial_number = self.SerialNumber or 'N/A'

    local written, result = pcall(sml.pd_log_write_subhealthy_info, self.RefControllerId,
        self.EstimatedRemainingLifespan, basic_info)
    if not written then
        log:error('Failed to record_write_amp, err: %s', result)
        return
    end

    local decoded = json.decode(result)
    -- re-read the table from self, as the original does after the sml call
    local latest = self.write_amplification_info
    latest.last_nand_written = phase_json_object(decoded, 'nand_write', 1)
    latest.last_host_written = phase_json_object(decoded, 'host_write', 1)
    latest.last_serial_number = self.SerialNumber
    latest.first_start_flag = 0
end

-- Decide whether drive log collection should start now.
-- Collection starts when any of these holds:
--   1. a previous collection exists and more than log_interval hours have passed;
--   2. start_time_in_hour equals STORAGE_INFO_INVALID_BYTE (start immediately);
--   3. no collection has happened yet and the time of day is less than 5 minutes
--      past the configured start hour.
---@param last_time integer time of the previous collection, 0 when there was none
---@param log_interval integer collection interval in hours
---@param start_time_in_hour integer first start hour; values above 23 are treated as 0
---@param current_time integer current time as returned by os.time()
---@return boolean
local function pd_log_check_collect_startup(last_time, log_interval, start_time_in_hour, current_time)
    local elapsed = current_time - last_time
    if elapsed < 0 then
        -- wrap-around; 2147483647 is taken as the maximum value of os.time()
        elapsed = 2147483647 + elapsed
    end

    if last_time ~= 0 and elapsed > log_interval * 3600 then
        return true
    end
    if start_time_in_hour == common_def.STORAGE_INFO_INVALID_BYTE then
        return true
    end

    local start_hour = start_time_in_hour
    if start_hour > 23 then
        start_hour = 0
    end
    local start_sec = start_hour * 3600
    local sec_of_day = tonumber(os.date('%H')) * 3600 + tonumber(os.date('%M')) * 60 + tonumber(os.date('%S'))

    -- NOTE(review): current_time is an epoch timestamp while start_sec is a
    -- second-of-day value, so `current_time >= start_sec` is effectively always
    -- true and any time of day earlier than start + 5 min passes. sec_of_day may
    -- have been intended; confirm against the polling interval before changing.
    return last_time == 0 and current_time >= start_sec and (sec_of_day - start_sec) < 300
end

-- Start the periodic task that polls the drive information from the controller.
-- Each round is skipped until sml.get_ctrl_init_state returns 2. A failed
-- get_pd_info is reported to the communication-loss monitor unless the error is
-- one of the two ignorable codes; a successful one clears the monitor and
-- publishes the data.
function c_drive:update_task()
    local function poll(task)
        if task.is_exit then
            return
        end

        local queried, init_state = pcall(sml.get_ctrl_init_state, self.RefControllerId)
        if not (queried and init_state == 2) then
            return
        end

        local fetched, pd_info = pcall(sml.get_pd_info, {Priority = 'Secondary'}, self.RefControllerId, self.device_id)
        if not fetched then
            -- on failure pd_info holds the raised error value
            local ignorable = pd_info == common_def.SML_ERR_CTRL_STATUS_INVALID or
                pd_info == common_def.SML_ERR_NULL_INFTERFACE
            if not ignorable then
                ctrl_commu_loss_monitor.get_instance():update(true, self.RefControllerId)
            end
            return
        end

        -- the task may have been stopped while the sml call was running
        if task.is_exit then
            return
        end
        ctrl_commu_loss_monitor.get_instance():update(false, self.RefControllerId)
        self.on_pd_update:emit(pd_info)
        -- NOTE(review): `self.presence` is lowercase here while the rest of this
        -- file reads `self.Presence` - confirm which field is intended.
        c_drives_object.get_instance().on_update_drive_temperature:emit(self.presence, pd_info, self.NodeId)
    end

    self:new_task({ TASK_UPDATE, self.ObjectName }):loop(poll):set_timeout_ms(10000)
end

-- Start the periodic task that polls SMART information.
-- Only SAS drives are polled; the result is published when its
-- manufacture_data field is non-empty.
function c_drive:smart_update_task()
    local function poll_smart(task)
        if task.is_exit then
            return
        end

        local queried, init_state = pcall(sml.get_ctrl_init_state, self.RefControllerId)
        if not (queried and init_state == 2) then
            return
        end
        if self.Protocol ~= common_def.DRIVE_PROTOCOL.SAS then
            return
        end

        local fetched, smart_info = pcall(sml.get_pd_sas_smart_info, self.RefControllerId, self.device_id)
        if not fetched then
            -- on failure smart_info holds the raised error value
            local ignorable = smart_info == common_def.SML_ERR_CTRL_STATUS_INVALID or
                smart_info == common_def.SML_ERR_NULL_INFTERFACE
            if not ignorable then
                ctrl_commu_loss_monitor.get_instance():update(true, self.RefControllerId)
            end
            return
        end

        -- the task may have been stopped while the sml call was running
        if task.is_exit then
            return
        end
        if string.len(smart_info.manufacture_data) > 0 then
            self.on_pd_update_smart:emit(smart_info)
        end
    end

    self:new_task({ TASK_UPDATE_SMART, self.ObjectName }):loop(poll_smart):set_timeout_ms(300000)
end

-- Start the periodic task that triggers drive log collection.
-- Every TASK_DUMP_LOG_INTERVAL the start condition is evaluated; when it holds,
-- the schedule in self.dump_info is updated and on_dump_log is emitted.
function c_drive:dump_update_task()
    local function check_and_dump(task)
        if task.is_exit then
            return
        end

        local queried, init_state = pcall(sml.get_ctrl_init_state, self.RefControllerId)
        if not (queried and init_state == 2) then
            return
        end

        local now = os.time()
        local schedule = self.dump_info
        local due = pd_log_check_collect_startup(schedule.last_dump_time, schedule.dump_interval,
            schedule.first_start_time_in_hour, now)
        if due then
            -- after a collection the first-start hour is set to 12 (noon) by default
            schedule.first_start_time_in_hour = 12
            schedule.last_dump_time = now
            self.on_dump_log:emit()
        end
    end

    self:new_task({ TASK_DUMP_LOG, self.ObjectName }):loop(check_and_dump):set_timeout_ms(TASK_DUMP_LOG_INTERVAL)
end

-- Start the pre-check task that waits for the drive protocol to become valid.
-- Once it is, the real dump task is started and this pre-check task stops itself.
function c_drive:dump_update_task_precheck()
    local function precheck(task)
        if task.is_exit then
            return
        end

        local protocol_ready = self.Protocol and
            self.Protocol ~= common_def.INVALID_U8 and
            self.is_protocol_valid
        if not protocol_ready then
            return
        end

        self:dump_update_task()
        self:stop_task({ TASK_DUMP_LOG_PRECHECK, self.ObjectName })
    end

    self:new_task({ TASK_DUMP_LOG_PRECHECK, self.ObjectName }):loop(precheck)
        :set_timeout_ms(TASK_DUMP_LOG_PRECHECK_INTERVAL)
end

-- Start the periodic tasks of this drive: info update, SMART update and the
-- dump-log pre-check (which later starts the dump task itself).
function c_drive:start_update_tasks()
    local starters = { 'update_task', 'smart_update_task', 'dump_update_task_precheck' }
    for _, starter in ipairs(starters) do
        self[starter](self)
    end
end

-- Stop every periodic task of this drive, then reset RebuildState and restore
-- the default property values.
function c_drive:stop_update_tasks()
    local task_names = { TASK_UPDATE, TASK_UPDATE_SMART, TASK_DUMP_LOG, TASK_DUMP_LOG_PRECHECK }
    for _, task_name in ipairs(task_names) do
        self:stop_task({ task_name, self.ObjectName })
    end
    self.RebuildState = 0
    self:set_default_values()
end

-- Tell whether the firmware status is one of the abnormal states
-- (UNCONFIGURED_BAD, FAILED or OFFLINE).
---@return boolean
function c_drive:is_abnormal()
    local pd_state = common_def.PD_STATE
    local fw_status = self.FirmwareStatus
    return fw_status == pd_state.UNCONFIGURED_BAD or
        fw_status == pd_state.FAILED or
        fw_status == pd_state.OFFLINE
end

-- Tell whether a running diagnosis may continue: the drive must be present and
-- the storage configuration must report power_on == 1.
---@return boolean
function c_drive:diagnose_continue()
    if self.Presence ~= 1 then
        return false
    end
    return c_storageconfig.get_instance().power_on == 1
end

-- Diagnose the enclosure (backplane) communication of this drive.
-- Skipped while the out-of-band configuration is not ready. The result is stored
-- as a boolean in context.encl_comm_error; a failed or empty sml call leaves the
-- context untouched.
---@param context table diagnosis context; reads oob_config_ready, writes encl_comm_error
function c_drive:diagnose_encl(context)
    if not context.oob_config_ready then
        return
    end
    local ok, ret = pcall(sml.pd_diag_encl_comm_error, self.RefControllerId, self.device_id, self.EnclosureId)
    if not ok or not ret then
        return
    end
    -- the previous `x == 1 and true and false` always evaluated to false, so a
    -- reported communication error was never recorded
    context.encl_comm_error = (ret.encl_comm_error == 1)
end

-- Diagnose the drive by its sense code.
-- Any non-zero sense_error is written to the maintenance log, but
-- context.sense_error becomes true only for the value 1.
---@param context table diagnosis context; writes sense_error
function c_drive:diag_by_sense_error(context)
    local called, result = pcall(sml.pd_diag_sense_error, self.RefControllerId, self.device_id)
    if not (called and result) then
        return
    end

    local sense_error = result.sense_error
    if sense_error ~= 0 then
        log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_PHYSICAL_FAULT,
            "Physical drive %s detected sense code error (%s)",
            self.Name, result.io_buf)
    end
    context.sense_error = sense_error == 1
end

-- Diagnose the drive by its SMART log.
-- A SMART log error or a CRC error is written to the maintenance log; only the
-- SMART log error is stored in the context.
-- NOTE(review): pd_crc_error is logged here but not copied into the context,
-- while diagnose_task reads context.pd_crc_error - confirm whether it should be.
---@param context table diagnosis context; writes pd_smart_log_error
function c_drive:diag_by_smart(context)
    local called, result = pcall(sml.pd_diag_by_smart_log, self.RefControllerId, self.device_id)
    if not (called and result) then
        return
    end

    local smart_error = result.pd_smart_log_error == 1
    if smart_error or result.pd_crc_error == 1 then
        log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_PHYSICAL_FAULT,
            "Physical drive %s detected smart attribute error (%s)",
            self.Name, result.io_buf)
    end
    context.pd_smart_log_error = smart_error
end

-- Diagnose the drive by its extended error log.
-- The outcome overwrites context.pd_smart_log_error and is written to the
-- maintenance log when an error was detected.
---@param context table diagnosis context; writes pd_smart_log_error
function c_drive:diag_by_extended_error(context)
    local called, result = pcall(sml.pd_diag_by_ext_error_log, self.RefControllerId, self.device_id)
    if not (called and result) then
        return
    end

    local log_error = result.pd_smart_log_error == 1
    if log_error then
        log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_PHYSICAL_FAULT,
            "Physical drive %s detected extended error log error (%s)",
            self.Name, result.io_buf)
    end
    context.pd_smart_log_error = log_error
end

-- Diagnose the drive by its logs: the SMART check first, then the extended
-- error log check unless an error was already found, the diagnosis must stop,
-- or the drive protocol is 1.
---@param context table diagnosis context shared by the diagnosis steps
function c_drive:diag_by_log(context)
    -- SMART check
    self:diag_by_smart(context)
    local stop = context.pd_smart_log_error or not self:diagnose_continue()

    -- Per the original note, SATA SSDs skip the standard log check; the code
    -- tests Protocol == 1 only (presumably the SATA protocol value - TODO confirm)
    if stop or self.Protocol == 1 then
        return
    end

    -- error log check
    self:diag_by_extended_error(context)
end

-- Query the SATA self-test status through sml.
-- Returns the `status` field of the result, or nothing when the call failed or
-- produced no result.
function c_drive:get_sata_self_test_status()
    local called, result = pcall(sml.pd_diag_get_sata_self_test_status, self.RefControllerId, self.device_id)
    if called and result then
        return result.status
    end
end

-- Trigger a short device self-test (short DST) on the drive.
-- Returns true when the test was started or one is already running. On failure
-- returns false and may record the reason in the context:
--   * SML_ERR_PD_SCSI_DEVICE_FAULT  -> pd_short_dst_error + io_buf
--   * SML_ERR_PD_SCSI_STATUS_FAIL   -> pd_passthru_cmd_fail
--   * SML_ERR_PD_SCSI_STATUS_BUSY   -> wait, retry once, and report
--     pd_short_dst_error + io_buf if the retry fails with a SCSI status error
---@param context table diagnosis context; may receive pd_short_dst_error, io_buf, pd_passthru_cmd_fail
---@return boolean
function c_drive:set_short_dst(context)
    local err = SML_ERR_CODE_E

    -- true when the test was started or a test is already in progress
    local function started(code)
        return code == err.SML_SUCCESS or code == err.SML_ERR_PD_IS_SMART_TEST
    end

    local ret = self.identify_pd:set_short_dst(SMART_TEST_TYPE.SATA_SHORT_OFFLINE)
    if started(ret) then
        return true
    end

    -- 0x2d/0x2e error: the sense data indicates a drive fault
    if ret == err.SML_ERR_PD_SCSI_DEVICE_FAULT then
        context.pd_short_dst_error = true
        context.io_buf = 'command fail because of the sense data error'
        return false
    end

    -- 0x2d/0x2e error with unknown cause
    if ret == err.SML_ERR_PD_SCSI_STATUS_FAIL then
        context.pd_passthru_cmd_fail = true
        return false
    end

    if ret ~= err.SML_ERR_PD_SCSI_STATUS_BUSY then
        return false
    end

    -- busy: wait (2 minutes per the original note) in 60 sleep steps, aborting as
    -- soon as the diagnosis may no longer continue, then retry once
    for _ = 1, 60 do
        skynet.sleep(200)
        if not self:diagnose_continue() then
            return false
        end
    end
    ret = self.identify_pd:set_short_dst(SMART_TEST_TYPE.SATA_SHORT_OFFLINE)

    -- a. started now: carry on with the following diagnosis steps.
    -- The previous condition `ret == SML_SUCCESS or SML_ERR_PD_IS_SMART_TEST`
    -- tested the constant itself and was therefore always true.
    if started(ret) then
        return true
    end

    -- b. these three codes are parsed from 0x2d/0x2e errors; if the retry still
    -- fails, report a drive fault and skip the remaining diagnosis
    if ret == err.SML_ERR_PD_SCSI_STATUS_FAIL or
        ret == err.SML_ERR_PD_SCSI_STATUS_BUSY or
        ret == err.SML_ERR_PD_SCSI_DEVICE_FAULT then
        context.pd_short_dst_error = true
        context.io_buf = 'command fail when try again'
    end
    return false
end

-- Poll the SATA self-test status until it reports success.
-- Tries up to 6 times, sleeping between tries (30 s per the original note) and
-- aborting as soon as the diagnosis may no longer continue.
-- context.in_progress is reset to 0 on entry and set to 1 once a poll found the
-- test not finished; diag_by_extended_self_test passes that value on to sml.
---@param context table diagnosis context; writes in_progress
---@return boolean true when the status reported success, false on timeout or abort
function c_drive:self_test_status_continue(context)
    context.in_progress = 0
    local try_count = 0
    while try_count < 6 do
        local ret = self:get_sata_self_test_status()
        if ret == SML_ERR_CODE_E.SML_SUCCESS then
            return true
        end
        -- fixed typo: this used to write `context.in_progres`, so in_progress
        -- never left 0
        context.in_progress = 1
        -- wait before querying the result again
        for _ = 1, 15 do
            skynet.sleep(200)
            if not self:diagnose_continue() then
                return false
            end
        end
        try_count = try_count + 1
    end
    return false
end

-- Diagnose the drive by its extended self-test log.
-- On a successful sml call the short-DST verdict and the io buffer are stored in
-- the context; otherwise the context is left untouched.
---@param context table diagnosis context; reads in_progress, writes pd_short_dst_error and io_buf
function c_drive:diag_by_extended_self_test(context)
    local called, result = pcall(sml.pd_diag_by_ext_self_test_log,
        self.RefControllerId, self.device_id, context.in_progress)
    if called and result then
        context.pd_short_dst_error = result.pd_short_dst_error == 1
        context.io_buf = result.io_buf
    end
end

-- Diagnose the drive with a short device self-test (short DST).
-- Flow: query the current self-test status, start a test if none is running,
-- wait, poll until the test finishes, then evaluate the extended self-test log.
-- Whatever step ends the flow after the initial checks, a detected short DST
-- error is written to the maintenance log at the EXIT label.
---@param context table diagnosis context; pd_short_dst_error / io_buf are set by the called steps
function c_drive:diag_by_short_dst(context)

    -- 1. Query the status and send the command that triggers a real-time short dst
    local ret = self:get_sata_self_test_status()
    -- anything other than "idle" (SML_SUCCESS) or "test running" ends the step silently
    if ret ~= SML_ERR_CODE_E.SML_SUCCESS and ret ~= SML_ERR_CODE_E.SML_ERR_PD_IS_SMART_TEST then
        return
    end

    if not self:diagnose_continue() then
        return
    end

    -- a test is only started when none is running yet
    if ret == SML_ERR_CODE_E.SML_SUCCESS then
        ret = self:set_short_dst(context)
        if not ret or not self:diagnose_continue() then
            goto EXIT
        end
    end

    -- Wait (2 minutes per the original note) before polling for the result;
    -- the wait is split into 60 sleep steps so it can be aborted early
    for _ = 1, 60 do
        skynet.sleep(200)
        if not self:diagnose_continue() then
            goto EXIT
        end
    end

    -- 2. Poll the test status
    if not self:self_test_status_continue(context) then
        goto EXIT
    end

    -- 3. Collect the extended self test log of the SATA drive and diagnose it
    self:diag_by_extended_self_test(context)

    ::EXIT::
    if context.pd_short_dst_error then
        log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_PHYSICAL_FAULT,
        "Physical drive %s detected short dst error (%s)",
        self.Name, context.io_buf)
    end
end

-- Diagnose the drive itself in up to three stages: sense code, SMART/standard
-- logs, short DST. A later stage only runs when the previous one found no error
-- and the diagnosis may still continue. context.pd_fault is always computed
-- from the three stage results at the end.
---@param context table diagnosis context; writes pd_fault
function c_drive:diagnose_self(context)
    -- 1. sense code check
    log:notice('Start to diagnose the fault of %s by sense code.', self.Name)
    self:diag_by_sense_error(context)
    log:notice('End to diagnose the fault of %s by sense code.', self.Name)

    if not context.sense_error and self:diagnose_continue() then
        -- 2. SMART attribute and standard log check
        log:notice('Start to diagnose the fault of %s by oob smart log.', self.Name)
        self:diag_by_log(context)
        log:notice('End to diagnose the fault of %s by oob smart log.', self.Name)

        if not context.pd_smart_log_error and self:diagnose_continue() then
            -- 3. short dst check
            log:notice('Start to diagnose the fault of %s by short dst.', self.Name)
            self:diag_by_short_dst(context)
            log:notice('End to diagnose the fault of %s by short dst.', self.Name)
        end
    end

    context.pd_fault = context.sense_error or context.pd_smart_log_error or context.pd_short_dst_error
end

-- Assert or deassert the drive fault alarm for one reporting source.
-- 'OOB' and 'CPLD' each keep their own flag (BMA is not considered);
-- self.Failure is the OR of both flags. A maintenance log entry is written only
-- when the flag of the given source actually changes.
---@param assert boolean true to assert, false to deassert
---@param source string 'OOB' or 'CPLD'; other values only recompute self.Failure
function c_drive:generate_fault(assert, source)
    local flag_by_source = { OOB = 'fault_by_oob', CPLD = 'fault_by_cpld' }
    local flag = flag_by_source[source]
    if flag and self[flag] ~= assert then
        self[flag] = assert
        log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_PHYSICAL_FAULT, "[%s] Physical drive %s is fault - %s",
            source, self.Name, assert and 'Asserted' or 'Deasserted')
    end

    self.Failure = (self.fault_by_oob or self.fault_by_cpld)
end

-- Diagnose the link PHY between controller, enclosure and drive.
-- Skipped while the out-of-band configuration is not ready. context.link_fault
-- becomes true when any of the four link error fields of the result equals 1,
-- in which case the finding is also written to the maintenance log.
---@param context table diagnosis context; reads oob_config_ready and pd_fault, writes link_fault
function c_drive:diagnose_link_phy(context)
    if not context.oob_config_ready then
        return
    end

    local called, result = pcall(sml.pd_diag_link_phy_error, self.RefControllerId, self.device_id, self.EnclosureId,
        context.pd_fault and 1 or 0, 1000, 10000)
    if not (called and result) then
        return
    end

    local link_error_fields = {
        'ctrl_encl_link_error', 'encl_pd_link_error', 'ctrl_pd_link_error', 'encl_encl_link_error'
    }
    local link_fault = false
    for _, field in ipairs(link_error_fields) do
        if result[field] == 1 then
            link_fault = true
            break
        end
    end

    context.link_fault = link_fault
    if link_fault then
        log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_LINK_PHY_ERROR,
            "Physical drive %s detected link phy error%s", self.Name, result.io_buf)
    end
end

-- Run the full diagnosis of this physical drive: enclosure, the drive itself,
-- then the link. After each stage the run is abandoned (jump to EXIT) when the
-- diagnosis may no longer continue. Alarms are generated from the collected
-- context only when all three stages were completed.
function c_drive:diagnose_task()
    -- shared state handed to every diagnosis stage
    local context = {
        oob_config_ready = c_storageconfig.get_instance():get_set_configuration_ready_flag(common_def.INVALID_U8)
    }
    log:maintenance(log.MLOG_INFO, common_def.FC__PUBLIC_OK, "Start to diagnose physical drive %s", self.Name)
    -- 1. Enclosure (backplane) diagnosis
    self:diagnose_encl(context)
    if not self:diagnose_continue() then
        goto EXIT
    end

    -- 2. Drive diagnosis
    self:diagnose_self(context)
    if not self:diagnose_continue() then
        goto EXIT
    end

    -- 3. Link diagnosis
    self:diagnose_link_phy(context)
    if not self:diagnose_continue() then
        goto EXIT
    end

    -- 4. Generate alarms from the results
    -- a. Drive fault alarm
    if context.pd_fault then
        self:generate_fault(true, 'OOB')
    end
    -- b. Link alarm (the link-fault branch is intentionally empty here)
    if context.link_fault then
    elseif not context.pd_fault and
            not context.encl_comm_error and
            (context.pd_crc_error or context.pd_passthru_cmd_fail) then
        -- With no link or backplane fault, a drive crc error or a failure to
        -- fetch real-time data marks the drive itself as faulty
        -- NOTE(review): no code in this part of the file assigns
        -- context.pd_crc_error - confirm where it is expected to be set
        if context.pd_crc_error then
            log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_PHYSICAL_FAULT,
                "Physical drive %s detected crc error", self.Name)
        end

        if context.pd_passthru_cmd_fail then
            log:maintenance(log.MLOG_ERROR, common_def.FC_STORAGE_PHYSICAL_FAULT,
                "Physical drive %s failed to respond scsi command", self.Name)
        end

        self:generate_fault(true, 'OOB')
    end
    -- c. Backplane expander communication-loss alarm (currently an empty placeholder)
    if context.encl_comm_error then
    end
    ::EXIT::
    log:maintenance(log.MLOG_INFO, common_def.FC__PUBLIC_OK, "End to diagnose physical drive %s", self.Name)
end

-- Print the health status of the physical drive to a file.
-- The status line is always written. The per-bit error details follow only when
-- one of the three low bits of HealthCode is set and the code is not INVALID_U16.
---@param fp_w file handle opened for writing
function c_drive:fprint_pd_health_status(fp_w)
    local status_text = common_def.HEALTH_STATUS_STR[self.Health] or "Unknown"
    fp_w:write(string.format("%-40s : %s\n", "Health Status", status_text))

    if (self.HealthCode & 0x0007) == 0 or self.HealthCode == common_def.INVALID_U16 then
        return
    end

    fp_w:write("\tErrors detected from RAID controller : :\n")
    -- { bit mask, line printed when the bit is set }
    local error_bits = {
        { 1, "\t\tMedia Errors\n" },
        { 2, "\t\tPrefail Errors\n" },
        { 4, "\t\tOther Errors\n" }
    }
    for _, entry in ipairs(error_bits) do
        local mask = entry[1]
        if (self.HealthCode & mask) == mask then
            fp_w:write(entry[2])
        end
    end
end

-- Print the drive capacity to a file, scaled to MB / GB / TB / PB.
-- An unset CapacityMiB is first replaced by INVALID_U32 on the object (as
-- before). An invalid capacity is printed as 'N/A'.
-- The validity check now runs on the raw value before scaling. It used to run
-- after scaling, so a sentinel of 1024 or more was divided down first, never
-- matched INVALID_U32 again, and was printed as a bogus capacity.
---@param fp_w file handle opened for writing
function c_drive:fprint_pd_drive_capacity(fp_w)
    local capacity_basic_unit <const> = 1024
    if not self.CapacityMiB then
        self.CapacityMiB = common_def.INVALID_U32
    end

    local res
    if self.CapacityMiB == common_def.INVALID_U32 then
        res = 'N/A'
    else
        local units = { 'MB', 'GB', 'TB', 'PB' }
        local cap = self.CapacityMiB
        local unit_index = 1
        -- divide by 1024 until the value fits the unit; PB is the largest unit
        while cap >= capacity_basic_unit and unit_index < #units do
            cap = cap / capacity_basic_unit
            unit_index = unit_index + 1
        end
        -- the trailing '\n' is kept from the original output format
        res = string.format('%.3f %s\n', cap, units[unit_index])
    end
    fp_w:write(string.format("%-40s : %s\n", "Capacity", res))
end

-- Print the rebuild state of the drive to a file.
-- RebuildState INVALID_U8 -> 'N/A', 0 -> 'No', 1 -> 'Yes' plus the progress,
-- anything else -> 'Unknown'. When the progress itself is invalid, the text
-- depends on whether the controller vendor is PMC.
---@param fp_w file handle opened for writing
function c_drive:fprint_pd_rebuild_state(fp_w)
    local state = self.RebuildState
    local text
    if state == common_def.INVALID_U8 then
        text = 'N/A'
    elseif state == 0 then
        text = 'No'
    elseif state ~= 1 then
        text = 'Unknown'
    elseif self.RebuildProgress ~= common_def.INVALID_U8 then
        text = string.format('%s (%s%%)', 'Yes', self.RebuildProgress)
    elseif method_misc:test_controller_vendor(self.RefControllerTypeId, common_def.VENDER_PMC) then
        text = 'Yes (N/A)'
    else
        text = 'Yes (Unknown Progress)'
    end

    fp_w:write(string.format("%-40s : %s\n", "Rebuild in Progress", text))
end

-- Print the locate state of the drive to a file.
-- 'On' when the fault LED is 0 and the locate LED is 1; otherwise 'N/A' when
-- either LED value is INVALID_U8; 'Off' in every other case.
---@param fp_w file handle opened for writing
function c_drive:fprint_pd_locate_state(fp_w)
    local fault_led = self.FaultLed
    local locate_led = self.LocateLed
    local invalid = common_def.INVALID_U8

    local state_text = 'Off'
    if fault_led == 0 and locate_led == 1 then
        state_text = 'On'
    elseif fault_led == invalid or locate_led == invalid then
        state_text = 'N/A'
    end
    fp_w:write(string.format("%-40s : %s\n", "Location State", state_text))
end

-- Print the basic identity information of the physical drive to a file, framed
-- by two separator lines. Values that equal their "invalid" sentinel, or have
-- no entry in the matching string table, are printed as 'N/A'.
---@param fp_w file handle opened for writing
function c_drive:dump_physical_drive(fp_w)
    local separator = "----------------------------------------------------------------------\n"
    local c = common_def

    -- write one "label : value" line
    local function put(label, value)
        fp_w:write(string.format("%-40s : %s\n", label, value))
    end

    -- replace a value equal to its invalid sentinel by 'N/A'
    local function or_na(value, invalid)
        if value == invalid then
            return 'N/A'
        end
        return value
    end

    fp_w:write(separator)
    put("ID", self.Id)
    put("Device Name", self.Name)
    put("Manufacturer", self.Manufacturer)
    put("Serial Number", self.SerialNumber)
    put("Model", self.Model)
    put("Firmware Version", self.Revision)
    self:fprint_pd_health_status(fp_w)
    put("Firmware State", c.PD_STATE_STR[self.FirmwareStatus] or ("UNDEFINED" .. self.FirmwareStatus))
    put("Power State", c.POWER_STATE_STR[self.PowerState] or 'N/A')
    put("Media Type", c.MEDIA_TYPE_STR[self.MediaType] or 'N/A')
    put("Interface Type", c.DRIVE_PROTOCOL_STR[self.Protocol] or 'N/A')
    put("Capable Speed", c.PD_PHYSICAL_SPEED[self.CapableSpeedGbs] or 'N/A')
    put("Negotiated Speed", c.PD_PHYSICAL_SPEED[self.NegotiatedSpeedGbs] or 'N/A')
    put("Drive Temperature", or_na(self.TemperatureCelsius, c.INVALID_U8))
    self:fprint_pd_drive_capacity(fp_w)
    put("Hot Spare", c.HOT_SPARE_TYPE_STR_NO_SPCAE[self.HotspareType] or 'N/A')
    self:fprint_pd_rebuild_state(fp_w)
    put("Patrol Read in Progress", c.PATROL_STATE_STR[self.PatrolState] or 'N/A')
    put("Remnant Media Wearout", or_na(self.PredictedMediaLifeLeftPercent, c.INVALID_U8))
    -- power-on hours use >= : every value from INVALID_U16 upwards counts as invalid
    put("Power-On Hours", self.PowerOnHours >= c.INVALID_U16 and 'N/A' or self.PowerOnHours)
    put("SAS Address(0)", self.SASAddress1)
    put("SAS Address(1)", self.SASAddress2)
    self:fprint_pd_locate_state(fp_w)
    put("Media Error Count", or_na(self.MediaErrorCount, c.INVALID_U32))
    put("Prefail Error Count", or_na(self.PredictedFailCount, c.INVALID_U32))
    put("Other Error Count", or_na(self.OtherErrorCount, c.INVALID_U32))

    fp_w:write(separator)
end

-- Set the locate LED by assigning the SetLocateLed property.
-- The LOCATE state is translated to TURN_ON_LOCATE first. The assignment runs
-- under pcall; a failure is logged and reported as false.
---@param locate_led_state integer requested LED state
---@return boolean true when the assignment succeeded
function c_drive:set_locate_led_by_smc(locate_led_state)
    local target_state = locate_led_state
    if target_state == common_def.LOCATE then
        target_state = common_def.TURN_ON_LOCATE
    end

    local function apply()
        self.SetLocateLed = target_state
    end

    local applied, err = pcall(apply)
    if applied then
        return true
    end
    log:error('set locate led by smc failed, err: %s', err)
    return false
end
 
-- Set the fault LED by assigning the SetFaultLed property.
-- The assignment runs under pcall; a failure is logged and reported as false.
---@param fault_led_state integer requested LED state
---@return boolean true when the assignment succeeded
function c_drive:set_fault_led_by_smc(fault_led_state)
    local function apply()
        self.SetFaultLed = fault_led_state
    end

    local applied, err = pcall(apply)
    if applied then
        return true
    end
    log:error('set fault led by smc failed, err: %s', err)
    return false
end

return c_drive