From 57aa18f52ea35ca74cd1a6c406a4abf04049b44e Mon Sep 17 00:00:00 2001
From: Ben <bene_thomas@web.de>
Date: Tue, 12 May 2020 22:48:30 +0200
Subject: [PATCH] Improve core timing accuracy (#5257)

* Improve core timing accuracy

* remove wrong global_ticks, use biggest ticks over all cores for GetGlobalTicks

* merge max slice length change
---
 src/core/core.cpp              | 27 +++++++++++++++++++--------
 src/core/core_timing.cpp       | 21 +++++++++++++++------
 src/core/core_timing.h         | 22 +++++++++++++---------
 src/core/hle/kernel/thread.cpp |  4 ++--
 src/tests/core/core_timing.cpp | 16 +++++++++++++---
 5 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/src/core/core.cpp b/src/core/core.cpp
index 47fafebe6..b22c288c1 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -148,7 +148,11 @@ System::ResultStatus System::RunLoop(bool tight_loop) {
     for (auto& cpu_core : cpu_cores) {
         if (cpu_core->GetTimer().GetTicks() < global_ticks) {
             s64 delay = global_ticks - cpu_core->GetTimer().GetTicks();
-            cpu_core->GetTimer().Advance(delay);
+            kernel->SetRunningCPU(cpu_core.get());
+            cpu_core->GetTimer().Advance();
+            cpu_core->PrepareReschedule();
+            kernel->GetThreadManager(cpu_core->GetID()).Reschedule();
+            cpu_core->GetTimer().SetNextSlice(delay);
             if (max_delay < delay) {
                 max_delay = delay;
                 current_core_to_execute = cpu_core.get();
@@ -156,10 +160,14 @@ System::ResultStatus System::RunLoop(bool tight_loop) {
         }
     }
 
-    if (max_delay > 0) {
+    // jit sometimes overshoot by a few ticks which might lead to a minimal desync in the cores.
+    // This small difference shouldn't make it necessary to sync the cores and would only cost
+    // performance. Thus we don't sync delays below min_delay
+    static constexpr s64 min_delay = 100;
+    if (max_delay > min_delay) {
         LOG_TRACE(Core_ARM11, "Core {} running (delayed) for {} ticks",
-                  current_core_to_execute->GetID(),
-                  current_core_to_execute->GetTimer().GetDowncount());
+                     current_core_to_execute->GetID(),
+                     current_core_to_execute->GetTimer().GetDowncount());
         if (running_core != current_core_to_execute) {
             running_core = current_core_to_execute;
             kernel->SetRunningCPU(running_core);
@@ -181,12 +189,15 @@ System::ResultStatus System::RunLoop(bool tight_loop) {
         // TODO: Make special check for idle since we can easily revert the time of idle cores
         s64 max_slice = Timing::MAX_SLICE_LENGTH;
         for (const auto& cpu_core : cpu_cores) {
+            kernel->SetRunningCPU(cpu_core.get());
+            cpu_core->GetTimer().Advance();
+            cpu_core->PrepareReschedule();
+            kernel->GetThreadManager(cpu_core->GetID()).Reschedule();
             max_slice = std::min(max_slice, cpu_core->GetTimer().GetMaxSliceLength());
         }
         for (auto& cpu_core : cpu_cores) {
-            cpu_core->GetTimer().Advance(max_slice);
-        }
-        for (auto& cpu_core : cpu_cores) {
+            cpu_core->GetTimer().SetNextSlice(max_slice);
+            auto start_ticks = cpu_core->GetTimer().GetTicks();
             LOG_TRACE(Core_ARM11, "Core {} running for {} ticks", cpu_core->GetID(),
                       cpu_core->GetTimer().GetDowncount());
             running_core = cpu_core.get();
@@ -204,8 +215,8 @@ System::ResultStatus System::RunLoop(bool tight_loop) {
                     cpu_core->Step();
                 }
             }
+            max_slice = cpu_core->GetTimer().GetTicks() - start_ticks;
         }
-        timing->AddToGlobalTicks(max_slice);
     }
 
     if (GDBStub::IsServerEnabled()) {
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index 875412e83..3d77932b6 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -111,11 +111,15 @@ s64 Timing::GetTicks() const {
 }
 
 s64 Timing::GetGlobalTicks() const {
-    return global_timer;
+    const auto& timer =
+        std::max_element(timers.cbegin(), timers.cend(), [](const auto& a, const auto& b) {
+            return a->GetTicks() < b->GetTicks();
+        });
+    return (*timer)->GetTicks();
 }
 
 std::chrono::microseconds Timing::GetGlobalTimeUs() const {
-    return std::chrono::microseconds{GetTicks() * 1000000 / BASE_CLOCK_RATE_ARM11};
+    return std::chrono::microseconds{GetGlobalTicks() * 1000000 / BASE_CLOCK_RATE_ARM11};
 }
 
 std::shared_ptr<Timing::Timer> Timing::GetTimer(std::size_t cpu_id) {
@@ -161,21 +165,22 @@ void Timing::Timer::MoveEvents() {
 }
 
 s64 Timing::Timer::GetMaxSliceLength() const {
-    auto next_event = std::find_if(event_queue.begin(), event_queue.end(),
-                                   [&](const Event& e) { return e.time - executed_ticks > 0; });
+    const auto& next_event = event_queue.begin();
     if (next_event != event_queue.end()) {
+        ASSERT(next_event->time - executed_ticks > 0);
         return next_event->time - executed_ticks;
     }
     return MAX_SLICE_LENGTH;
 }
 
-void Timing::Timer::Advance(s64 max_slice_length) {
+void Timing::Timer::Advance() {
     MoveEvents();
 
     s64 cycles_executed = slice_length - downcount;
     idled_cycles = 0;
     executed_ticks += cycles_executed;
-    slice_length = max_slice_length;
+    slice_length = 0;
+    downcount = 0;
 
     is_timer_sane = true;
 
@@ -191,6 +196,10 @@ void Timing::Timer::Advance(s64 max_slice_length) {
     }
 
     is_timer_sane = false;
+}
+
+void Timing::Timer::SetNextSlice(s64 max_slice_length) {
+    slice_length = max_slice_length;
 
     // Still events left (scheduled in the future)
     if (!event_queue.empty()) {
diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index 73b226cde..dadbe16e3 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -171,7 +171,13 @@ public:
         BOOST_SERIALIZATION_SPLIT_MEMBER()
     };
 
-    static constexpr int MAX_SLICE_LENGTH = 20000;
+    // currently Service::HID::pad_update_ticks is the smallest interval for an event that gets
+    // always scheduled. Therfore we use this as orientation for the MAX_SLICE_LENGTH
+    // For performance bigger slice length are desired, though this will lead to cores desync
+    // But we never want to schedule events into the current slice, because then cores might to
+    // run small slices to sync up again. This is especially important for events that are always
+    // scheduled and repated.
+    static constexpr int MAX_SLICE_LENGTH = BASE_CLOCK_RATE_ARM11 / 234;
 
     class Timer {
     public:
@@ -180,7 +186,9 @@ public:
 
         s64 GetMaxSliceLength() const;
 
-        void Advance(s64 max_slice_length = MAX_SLICE_LENGTH);
+        void Advance();
+
+        void SetNextSlice(s64 max_slice_length = MAX_SLICE_LENGTH);
 
         void Idle();
 
@@ -227,6 +235,9 @@ public:
         void serialize(Archive& ar, const unsigned int) {
             MoveEvents();
             // NOTE: ts_queue should be empty now
+            // TODO(SaveState): Remove the next two lines when we break compatibility
+            s64 x;
+            ar& x; // to keep compatibility with old save states that stored global_timer
             ar& event_queue;
             ar& event_fifo_id;
             ar& slice_length;
@@ -260,10 +271,6 @@ public:
 
     s64 GetGlobalTicks() const;
 
-    void AddToGlobalTicks(s64 ticks) {
-        global_timer += ticks;
-    }
-
     /**
      * Updates the value of the cpu clock scaling to the new percentage.
      */
@@ -274,8 +281,6 @@ public:
     std::shared_ptr<Timer> GetTimer(std::size_t cpu_id);
 
 private:
-    s64 global_timer = 0;
-
     // unordered_map stores each element separately as a linked list node so pointers to
     // elements remain stable regardless of rehashes/resizing.
     std::unordered_map<std::string, TimingEventType> event_types = {};
@@ -290,7 +295,6 @@ private:
     template <class Archive>
     void serialize(Archive& ar, const unsigned int file_version) {
         // event_types set during initialization of other things
-        ar& global_timer;
         ar& timers;
         if (file_version == 0) {
             std::shared_ptr<Timer> x;
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index 60d202631..80111c24c 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -111,7 +111,7 @@ void ThreadManager::SwitchContext(Thread* new_thread) {
     // Save context for previous thread
     if (previous_thread) {
         previous_process = previous_thread->owner_process;
-        previous_thread->last_running_ticks = timing.GetGlobalTicks();
+        previous_thread->last_running_ticks = cpu->GetTimer().GetTicks();
         cpu->SaveContext(previous_thread->context);
 
         if (previous_thread->status == ThreadStatus::Running) {
@@ -344,7 +344,7 @@ ResultVal<std::shared_ptr<Thread>> KernelSystem::CreateThread(
     thread->entry_point = entry_point;
     thread->stack_top = stack_top;
     thread->nominal_priority = thread->current_priority = priority;
-    thread->last_running_ticks = timing.GetGlobalTicks();
+    thread->last_running_ticks = timing.GetTimer(processor_id)->GetTicks();
     thread->processor_id = processor_id;
     thread->wait_objects.clear();
     thread->wait_address = 0;
diff --git a/src/tests/core/core_timing.cpp b/src/tests/core/core_timing.cpp
index 8b34ba9d7..b3247aed8 100644
--- a/src/tests/core/core_timing.cpp
+++ b/src/tests/core/core_timing.cpp
@@ -13,7 +13,8 @@
 
 // Numbers are chosen randomly to make sure the correct one is given.
 static constexpr std::array<u64, 5> CB_IDS{{42, 144, 93, 1026, UINT64_C(0xFFFF7FFFF7FFFF)}};
-static constexpr int MAX_SLICE_LENGTH = 20000; // Copied from CoreTiming internals
+static constexpr int MAX_SLICE_LENGTH =
+    BASE_CLOCK_RATE_ARM11 / 234; // Copied from CoreTiming internals
 
 static std::bitset<CB_IDS.size()> callbacks_ran_flags;
 static u64 expected_callback = 0;
@@ -36,7 +37,9 @@ static void AdvanceAndCheck(Core::Timing& timing, u32 idx, int downcount, int ex
 
     timing.GetTimer(0)->AddTicks(timing.GetTimer(0)->GetDowncount() -
                                  cpu_downcount); // Pretend we executed X cycles of instructions.
+
     timing.GetTimer(0)->Advance();
+    timing.GetTimer(0)->SetNextSlice();
 
     REQUIRE(decltype(callbacks_ran_flags)().set(idx) == callbacks_ran_flags);
     REQUIRE(downcount == timing.GetTimer(0)->GetDowncount());
@@ -53,6 +56,7 @@ TEST_CASE("CoreTiming[BasicOrder]", "[core]") {
 
     // Enter slice 0
     timing.GetTimer(0)->Advance();
+    timing.GetTimer(0)->SetNextSlice();
 
     // D -> B -> C -> A -> E
     timing.ScheduleEvent(1000, cb_a, CB_IDS[0], 0);
@@ -106,6 +110,7 @@ TEST_CASE("CoreTiming[SharedSlot]", "[core]") {
 
     // Enter slice 0
     timing.GetTimer(0)->Advance();
+    timing.GetTimer(0)->SetNextSlice();
     REQUIRE(1000 == timing.GetTimer(0)->GetDowncount());
 
     callbacks_ran_flags = 0;
@@ -113,6 +118,7 @@ TEST_CASE("CoreTiming[SharedSlot]", "[core]") {
     lateness = 0;
     timing.GetTimer(0)->AddTicks(timing.GetTimer(0)->GetDowncount());
     timing.GetTimer(0)->Advance();
+    timing.GetTimer(0)->SetNextSlice();
     REQUIRE(MAX_SLICE_LENGTH == timing.GetTimer(0)->GetDowncount());
     REQUIRE(0x1FULL == callbacks_ran_flags.to_ullong());
 }
@@ -125,6 +131,7 @@ TEST_CASE("CoreTiming[PredictableLateness]", "[core]") {
 
     // Enter slice 0
     timing.GetTimer(0)->Advance();
+    timing.GetTimer(0)->SetNextSlice();
 
     timing.ScheduleEvent(100, cb_a, CB_IDS[0], 0);
     timing.ScheduleEvent(200, cb_b, CB_IDS[1], 0);
@@ -161,6 +168,7 @@ TEST_CASE("CoreTiming[ChainScheduling]", "[core]") {
 
     // Enter slice 0
     timing.GetTimer(0)->Advance();
+    timing.GetTimer(0)->SetNextSlice();
 
     timing.ScheduleEvent(800, cb_a, CB_IDS[0], 0);
     timing.ScheduleEvent(1000, cb_b, CB_IDS[1], 0);
@@ -174,14 +182,16 @@ TEST_CASE("CoreTiming[ChainScheduling]", "[core]") {
     REQUIRE(2 == reschedules);
 
     timing.GetTimer(0)->AddTicks(timing.GetTimer(0)->GetDowncount());
-    timing.GetTimer(0)->Advance(); // cb_rs
+    timing.GetTimer(0)->Advance();
+    timing.GetTimer(0)->SetNextSlice(); // cb_rs
     REQUIRE(1 == reschedules);
     REQUIRE(200 == timing.GetTimer(0)->GetDowncount());
 
     AdvanceAndCheck(timing, 2, 800); // cb_c
 
     timing.GetTimer(0)->AddTicks(timing.GetTimer(0)->GetDowncount());
-    timing.GetTimer(0)->Advance(); // cb_rs
+    timing.GetTimer(0)->Advance();
+    timing.GetTimer(0)->SetNextSlice(); // cb_rs
     REQUIRE(0 == reschedules);
     REQUIRE(MAX_SLICE_LENGTH == timing.GetTimer(0)->GetDowncount());
 }