diff options
Diffstat (limited to 'thermal/utils/thermal_stats_helper.cpp')
-rw-r--r-- | thermal/utils/thermal_stats_helper.cpp | 207 |
1 files changed, 180 insertions, 27 deletions
diff --git a/thermal/utils/thermal_stats_helper.cpp b/thermal/utils/thermal_stats_helper.cpp index bbd99542..d4571d90 100644 --- a/thermal/utils/thermal_stats_helper.cpp +++ b/thermal/utils/thermal_stats_helper.cpp @@ -18,7 +18,6 @@ #include <android-base/logging.h> #include <android/binder_manager.h> -#include <hardware/google/pixel/pixelstats/pixelatoms.pb.h> #include <algorithm> #include <numeric> @@ -75,26 +74,49 @@ int calculateThresholdBucket(const std::vector<T> &thresholds, T value) { return bucket; } +void resetCurrentTempStatus(CurrTempStatus *curr_temp_status, float new_temp) { + curr_temp_status->temp = new_temp; + curr_temp_status->start_time = boot_clock::now(); + curr_temp_status->repeat_count = 1; +} + } // namespace bool ThermalStatsHelper::initializeStats( const Json::Value &config, const std::unordered_map<std::string, SensorInfo> &sensor_info_map_, const std::unordered_map<std::string, CdevInfo> &cooling_device_info_map_) { - StatsConfig stats_config; - if (!ParseStatsConfig(config, sensor_info_map_, cooling_device_info_map_, &stats_config)) { - LOG(ERROR) << "Failed to parse stats config"; + StatsInfo<float> sensor_stats_info; + AbnormalStatsInfo abnormal_stats_info; + if (!ParseSensorStatsConfig(config, sensor_info_map_, &sensor_stats_info, + &abnormal_stats_info)) { + LOG(ERROR) << "Failed to parse sensor stats config"; + return false; + } + StatsInfo<int> cooling_device_request_info; + if (!ParseCoolingDeviceStatsConfig(config, cooling_device_info_map_, + &cooling_device_request_info)) { + LOG(ERROR) << "Failed to parse cooling device stats config"; + return false; + } + if (!initializeSensorTempStats(sensor_stats_info, sensor_info_map_)) { + LOG(ERROR) << "Failed to initialize sensor temp stats"; return false; } - bool is_initialized_ = - initializeSensorTempStats(stats_config.sensor_stats_info, sensor_info_map_) && - initializeSensorCdevRequestStats(stats_config.cooling_device_request_info, - sensor_info_map_, cooling_device_info_map_); - if (is_initialized_) { - last_total_stats_report_time = boot_clock::now(); - LOG(INFO) << "Thermal Stats Initialized Successfully"; + if (!initializeSensorCdevRequestStats(cooling_device_request_info, sensor_info_map_, + cooling_device_info_map_)) { + LOG(ERROR) << "Failed to initialize sensor cooling device request stats"; + return false; + } + if (!initializeSensorAbnormalityStats(abnormal_stats_info, sensor_info_map_)) { + LOG(ERROR) << "Failed to initialize sensor abnormal stats"; + return false; } - return is_initialized_; + + last_total_stats_report_time = boot_clock::now(); + abnormal_stats_reported_per_update_interval = 0; + LOG(INFO) << "Thermal Stats Initialized Successfully"; + return true; } bool ThermalStatsHelper::initializeSensorCdevRequestStats( @@ -161,7 +183,8 @@ bool ThermalStatsHelper::initializeSensorCdevRequestStats( bool ThermalStatsHelper::initializeSensorTempStats( const StatsInfo<float> &sensor_stats_info, const std::unordered_map<std::string, SensorInfo> &sensor_info_map_) { - std::unique_lock<std::shared_mutex> _lock(sensor_temp_stats_map_mutex_); + std::unique_lock<std::shared_mutex> _lock(sensor_stats_mutex_); + auto &temp_stats_map_ = sensor_stats.temp_stats_map_; const int severity_time_in_state_size = kThrottlingSeverityCount; for (const auto &[sensor, sensor_info] : sensor_info_map_) { // Record by severity @@ -169,7 +192,7 @@ bool ThermalStatsHelper::initializeSensorTempStats( isRecordByDefaultThreshold( sensor_stats_info.record_by_default_threshold_all_or_name_set_, sensor)) { // number of buckets = number of severity - sensor_temp_stats_map_[sensor].stats_by_default_threshold = + temp_stats_map_[sensor].stats_by_default_threshold = StatsRecord(severity_time_in_state_size); LOG(INFO) << "Sensor temp stats on basis of severity initialized for [" << sensor << "]"; @@ -178,8 +201,7 @@ bool ThermalStatsHelper::initializeSensorTempStats( // Record by custom threshold if (sensor_stats_info.record_by_threshold.count(sensor)) { for (const auto &threshold_list : sensor_stats_info.record_by_threshold.at(sensor)) { - sensor_temp_stats_map_[sensor].stats_by_custom_threshold.emplace_back( - threshold_list); + temp_stats_map_[sensor].stats_by_custom_threshold.emplace_back(threshold_list); LOG(INFO) << "Sensor temp stats on basis of threshold initialized for [" << sensor << "]"; } @@ -188,6 +210,54 @@ bool ThermalStatsHelper::initializeSensorTempStats( return true; } +bool ThermalStatsHelper::initializeSensorAbnormalityStats( + const AbnormalStatsInfo &abnormal_stats_info, + const std::unordered_map<std::string, SensorInfo> &sensor_info_map_) { + std::unique_lock<std::shared_mutex> _lock(sensor_stats_mutex_); + auto &temp_range_info_map_ = sensor_stats.temp_range_info_map_; + for (const auto &sensors_temp_range_info : abnormal_stats_info.sensors_temp_range_infos) { + const auto &temp_range_info_ptr = + std::make_shared<TempRangeInfo>(sensors_temp_range_info.temp_range_info); + for (const auto &sensor : sensors_temp_range_info.sensors) { + temp_range_info_map_[sensor] = temp_range_info_ptr; + } + } + auto &temp_stuck_info_map_ = sensor_stats.temp_stuck_info_map_; + for (const auto &sensors_temp_stuck_info : abnormal_stats_info.sensors_temp_stuck_infos) { + const auto &temp_stuck_info_ptr = + std::make_shared<TempStuckInfo>(sensors_temp_stuck_info.temp_stuck_info); + for (const auto &sensor : sensors_temp_stuck_info.sensors) { + temp_stuck_info_map_[sensor] = temp_stuck_info_ptr; + } + } + const auto &default_temp_range_info_ptr = + abnormal_stats_info.default_temp_range_info + ? std::make_shared<TempRangeInfo>( + abnormal_stats_info.default_temp_range_info.value()) + : nullptr; + const auto &default_temp_stuck_info_ptr = + abnormal_stats_info.default_temp_stuck_info + ? std::make_shared<TempStuckInfo>( + abnormal_stats_info.default_temp_stuck_info.value()) + : nullptr; + for (const auto &sensor_info : sensor_info_map_) { + const auto &sensor = sensor_info.first; + if (default_temp_range_info_ptr && !temp_range_info_map_.count(sensor)) + temp_range_info_map_[sensor] = default_temp_range_info_ptr; + if (default_temp_stuck_info_ptr && !temp_stuck_info_map_.count(sensor)) + temp_stuck_info_map_[sensor] = default_temp_stuck_info_ptr; + } + + for (const auto &sensor_temp_stuck_info : temp_stuck_info_map_) { + sensor_stats.curr_temp_status_map_[sensor_temp_stuck_info.first] = { + .temp = std::numeric_limits<float>::min(), + .start_time = boot_clock::time_point::min(), + .repeat_count = 0, + }; + } + return true; +} + void ThermalStatsHelper::updateStatsRecord(StatsRecord *stats_record, int new_state) { const auto now = boot_clock::now(); const auto cur_state_duration = std::chrono::duration_cast<std::chrono::milliseconds>( @@ -231,11 +301,13 @@ void ThermalStatsHelper::updateSensorCdevRequestStats(std::string_view sensor, void ThermalStatsHelper::updateSensorTempStatsByThreshold(std::string_view sensor, float temperature) { - std::unique_lock<std::shared_mutex> _lock(sensor_temp_stats_map_mutex_); - if (!sensor_temp_stats_map_.count(sensor.data())) { + std::unique_lock<std::shared_mutex> _lock(sensor_stats_mutex_); + verifySensorAbnormality(sensor, temperature); + auto &temp_stats_map_ = sensor_stats.temp_stats_map_; + if (!temp_stats_map_.count(sensor.data())) { return; } - auto &sensor_temp_stats = sensor_temp_stats_map_[sensor.data()]; + auto &sensor_temp_stats = temp_stats_map_[sensor.data()]; for (auto &stats_by_threshold : sensor_temp_stats.stats_by_custom_threshold) { int value = calculateThresholdBucket(stats_by_threshold.thresholds, temperature); if (value != stats_by_threshold.stats_record.cur_state) { @@ -256,11 +328,11 @@ void ThermalStatsHelper::updateSensorTempStatsByThreshold(std::string_view senso void ThermalStatsHelper::updateSensorTempStatsBySeverity(std::string_view sensor, const ThrottlingSeverity &severity) { - std::unique_lock<std::shared_mutex> _lock(sensor_temp_stats_map_mutex_); - if (sensor_temp_stats_map_.count(sensor.data()) && - sensor_temp_stats_map_[sensor.data()].stats_by_default_threshold.has_value()) { - auto &stats_record = - sensor_temp_stats_map_[sensor.data()].stats_by_default_threshold.value(); + std::unique_lock<std::shared_mutex> _lock(sensor_stats_mutex_); + auto &temp_stats_map_ = sensor_stats.temp_stats_map_; + if (temp_stats_map_.count(sensor.data()) && + temp_stats_map_[sensor.data()].stats_by_default_threshold.has_value()) { + auto &stats_record = temp_stats_map_[sensor.data()].stats_by_default_threshold.value(); int value = static_cast<int>(severity); if (value != stats_record.cur_state) { LOG(VERBOSE) << "Updating sensor stats for sensor: " << sensor.data() @@ -270,6 +342,52 @@ void ThermalStatsHelper::updateSensorTempStatsBySeverity(std::string_view sensor } } +void ThermalStatsHelper::verifySensorAbnormality(std::string_view sensor, float temp) { + LOG(VERBOSE) << "Verify sensor abnormality for " << sensor << " with temp " << temp; + if (sensor_stats.temp_range_info_map_.count(sensor.data())) { + const auto &temp_range_info = sensor_stats.temp_range_info_map_[sensor.data()]; + if (temp < temp_range_info->min_temp_threshold) { + LOG(ERROR) << "Outlier Temperature Detected, sensor: " << sensor.data() + << " temp: " << temp << " < " << temp_range_info->min_temp_threshold; + reportThermalAbnormality(ThermalSensorAbnormalityDetected::EXTREME_LOW_TEMP, sensor, + std::round(temp)); + } else if (temp > temp_range_info->max_temp_threshold) { + LOG(ERROR) << "Outlier Temperature Detected, sensor: " << sensor.data() + << " temp: " << temp << " > " << temp_range_info->max_temp_threshold; + reportThermalAbnormality(ThermalSensorAbnormalityDetected::EXTREME_HIGH_TEMP, sensor, + std::round(temp)); + } + } + if (sensor_stats.temp_stuck_info_map_.count(sensor.data())) { + const auto &temp_stuck_info = sensor_stats.temp_stuck_info_map_[sensor.data()]; + auto &curr_temp_status = sensor_stats.curr_temp_status_map_[sensor.data()]; + LOG(VERBOSE) << "Current Temp Status: temp=" << curr_temp_status.temp + << " repeat_count=" << curr_temp_status.repeat_count + << " start_time=" << curr_temp_status.start_time.time_since_epoch().count(); + if (std::fabs(curr_temp_status.temp - temp) <= kPrecisionThreshold) { + curr_temp_status.repeat_count++; + if (temp_stuck_info->min_polling_count <= curr_temp_status.repeat_count) { + auto time_elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>( + boot_clock::now() - curr_temp_status.start_time); + if (temp_stuck_info->min_stuck_duration <= time_elapsed_ms) { + LOG(ERROR) << "Stuck Temperature Detected, sensor: " << sensor.data() + << " temp: " << temp << " repeated " + << temp_stuck_info->min_polling_count << " times for " + << time_elapsed_ms.count() << "ms"; + if (reportThermalAbnormality(ThermalSensorAbnormalityDetected::SENSOR_STUCK, + sensor, std::round(temp))) { + // reset current status to verify for sensor stuck with start time as + // current polling + resetCurrentTempStatus(&curr_temp_status, temp); + } + } + } + } else { + resetCurrentTempStatus(&curr_temp_status, temp); + } + } +} + int ThermalStatsHelper::reportStats() { const auto curTime = boot_clock::now(); const auto since_last_total_stats_update_ms = @@ -290,13 +408,14 @@ int ThermalStatsHelper::reportStats() { int count_failed_reporting = reportAllSensorTempStats(stats_client) + reportAllSensorCdevRequestStats(stats_client); last_total_stats_report_time = curTime; + abnormal_stats_reported_per_update_interval = 0; return count_failed_reporting; } int ThermalStatsHelper::reportAllSensorTempStats(const std::shared_ptr<IStats> &stats_client) { int count_failed_reporting = 0; - std::unique_lock<std::shared_mutex> _lock(sensor_temp_stats_map_mutex_); - for (auto &[sensor, temp_stats] : sensor_temp_stats_map_) { + std::unique_lock<std::shared_mutex> _lock(sensor_stats_mutex_); + for (auto &[sensor, temp_stats] : sensor_stats.temp_stats_map_) { for (size_t threshold_set_idx = 0; threshold_set_idx < temp_stats.stats_by_custom_threshold.size(); threshold_set_idx++) { auto &stats_by_threshold = temp_stats.stats_by_custom_threshold[threshold_set_idx]; @@ -445,6 +564,40 @@ std::vector<int64_t> ThermalStatsHelper::processStatsRecordForReporting(StatsRec return stats_residency; } +bool ThermalStatsHelper::reportThermalAbnormality( + const ThermalSensorAbnormalityDetected::AbnormalityType &type, std::string_view name, + std::optional<int> reading) { + const auto value_str = reading.has_value() ? std::to_string(reading.value()) : "undefined"; + if (abnormal_stats_reported_per_update_interval >= kMaxAbnormalLoggingPerUpdateInterval) { + LOG(ERROR) << "Thermal abnormal atom logging rate limited for " << name.data() + << " with value " << value_str; + return true; + } + const std::shared_ptr<IStats> stats_client = getStatsService(); + if (!stats_client) { + LOG(ERROR) << "Unable to get AIDL Stats service"; + return false; + } + std::vector<VendorAtomValue> values(3); + values[ThermalSensorAbnormalityDetected::kTypeFieldNumber - kVendorAtomOffset] = + VendorAtomValue::make<VendorAtomValue::intValue>(type); + values[ThermalSensorAbnormalityDetected::kSensorFieldNumber - kVendorAtomOffset] = + VendorAtomValue::make<VendorAtomValue::stringValue>(name); + if (reading.has_value()) { + values[ThermalSensorAbnormalityDetected::kTempFieldNumber - kVendorAtomOffset] = + VendorAtomValue::make<VendorAtomValue::intValue>(reading.value()); + } + if (!reportAtom(stats_client, PixelAtoms::Atom::kThermalSensorAbnormalityDetected, + std::move(values))) { + LOG(ERROR) << "Failed to log thermal abnormal atom for " << name.data() << " with value " + << value_str; + return false; + } + LOG(INFO) << "Thermal abnormality reported for " << name.data() << " with value " << value_str; + abnormal_stats_reported_per_update_interval++; + return true; +} + bool ThermalStatsHelper::reportAtom(const std::shared_ptr<IStats> &stats_client, const int32_t &atom_id, std::vector<VendorAtomValue> &&values) { LOG(VERBOSE) << "Reporting thermal stats for atom_id " << atom_id; @@ -467,7 +620,7 @@ StatsRecord ThermalStatsHelper::restoreStatsRecordOnFailure( } std::unordered_map<std::string, SensorTempStats> ThermalStatsHelper::GetSensorTempStatsSnapshot() { - auto sensor_temp_stats_snapshot = sensor_temp_stats_map_; + auto sensor_temp_stats_snapshot = sensor_stats.temp_stats_map_; for (auto &sensor_temp_stats_pair : sensor_temp_stats_snapshot) { for (auto &temp_stats : sensor_temp_stats_pair.second.stats_by_custom_threshold) { // update the last unclosed entry and start new record with same state |