diff --git a/src/citra/config.cpp b/src/citra/config.cpp index 0ec7bc6b2..b4e3a2ce9 100644 --- a/src/citra/config.cpp +++ b/src/citra/config.cpp @@ -95,6 +95,11 @@ void Config::ReadValues() { // Renderer Settings::values.use_hw_renderer = sdl2_config->GetBoolean("Renderer", "use_hw_renderer", true); + Settings::values.use_hw_shader = sdl2_config->GetBoolean("Renderer", "use_hw_shader", true); + Settings::values.shaders_accurate_gs = + sdl2_config->GetBoolean("Renderer", "shaders_accurate_gs", true); + Settings::values.shaders_accurate_mul = + sdl2_config->GetBoolean("Renderer", "shaders_accurate_mul", false); Settings::values.use_shader_jit = sdl2_config->GetBoolean("Renderer", "use_shader_jit", true); Settings::values.resolution_factor = static_cast(sdl2_config->GetInteger("Renderer", "resolution_factor", 1)); diff --git a/src/citra/default_ini.h b/src/citra/default_ini.h index 18be9ec25..7179d6f94 100644 --- a/src/citra/default_ini.h +++ b/src/citra/default_ini.h @@ -77,6 +77,18 @@ use_cpu_jit = # 0: Software, 1 (default): Hardware use_hw_renderer = +# Whether to use hardware shaders to emulate 3DS shaders +# 0: Software, 1 (default): Hardware +use_hw_shader = + +# Whether to use accurate multiplication in hardware shaders +# 0: Off (Default. Faster, but causes issues in some games) 1: On (Slower, but correct) +shaders_accurate_mul = + +# Whether to fallback to software for geometry shaders +# 0: Off (Faster, but causes issues in some games) 1: On (Default. 
Slower, but correct) +shaders_accurate_gs = + # Whether to use the Just-In-Time (JIT) compiler for shader emulation # 0: Interpreter (slow), 1 (default): JIT (fast) use_shader_jit = diff --git a/src/citra_qt/configuration/config.cpp b/src/citra_qt/configuration/config.cpp index ca99856b5..bdb296659 100644 --- a/src/citra_qt/configuration/config.cpp +++ b/src/citra_qt/configuration/config.cpp @@ -83,6 +83,10 @@ void Config::ReadValues() { qt_config->beginGroup("Renderer"); Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", true).toBool(); + Settings::values.use_hw_shader = qt_config->value("use_hw_shader", true).toBool(); + Settings::values.shaders_accurate_gs = qt_config->value("shaders_accurate_gs", true).toBool(); + Settings::values.shaders_accurate_mul = + qt_config->value("shaders_accurate_mul", false).toBool(); Settings::values.use_shader_jit = qt_config->value("use_shader_jit", true).toBool(); Settings::values.resolution_factor = static_cast(qt_config->value("resolution_factor", 1).toInt()); @@ -272,6 +276,9 @@ void Config::SaveValues() { qt_config->beginGroup("Renderer"); qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer); + qt_config->setValue("use_hw_shader", Settings::values.use_hw_shader); + qt_config->setValue("shaders_accurate_gs", Settings::values.shaders_accurate_gs); + qt_config->setValue("shaders_accurate_mul", Settings::values.shaders_accurate_mul); qt_config->setValue("use_shader_jit", Settings::values.use_shader_jit); qt_config->setValue("resolution_factor", Settings::values.resolution_factor); qt_config->setValue("use_vsync", Settings::values.use_vsync); diff --git a/src/core/settings.cpp b/src/core/settings.cpp index f7467e573..3cc38b982 100644 --- a/src/core/settings.cpp +++ b/src/core/settings.cpp @@ -22,6 +22,9 @@ void Apply() { VideoCore::g_hw_renderer_enabled = values.use_hw_renderer; VideoCore::g_shader_jit_enabled = values.use_shader_jit; + VideoCore::g_hw_shader_enabled = values.use_hw_shader; 
+ VideoCore::g_hw_shader_accurate_gs = values.shaders_accurate_gs; + VideoCore::g_hw_shader_accurate_mul = values.shaders_accurate_mul; if (VideoCore::g_emu_window) { auto layout = VideoCore::g_emu_window->GetFramebufferLayout(); diff --git a/src/core/settings.h b/src/core/settings.h index d574a1578..b57a1cbed 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -107,6 +107,9 @@ struct Values { // Renderer bool use_hw_renderer; + bool use_hw_shader; + bool shaders_accurate_gs; + bool shaders_accurate_mul; bool use_shader_jit; u16 resolution_factor; bool use_vsync; diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 8612a758f..a2d231bf3 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -286,6 +286,38 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr); + PrimitiveAssembler& primitive_assembler = g_state.primitive_assembler; + + bool accelerate_draw = VideoCore::g_hw_shader_enabled && primitive_assembler.IsEmpty(); + + if (regs.pipeline.use_gs == PipelineRegs::UseGS::No) { + auto topology = primitive_assembler.GetTopology(); + if (topology == PipelineRegs::TriangleTopology::Shader || + topology == PipelineRegs::TriangleTopology::List) { + accelerate_draw = accelerate_draw && (regs.pipeline.num_vertices % 3) == 0; + } + // TODO (wwylele): for Strip/Fan topology, if the primitive assembler is not restarted + // after this draw call, the buffered vertex from this draw should "leak" to the next + // draw, in which case we should buffer the vertex into the software primitive assembler, + // or disable accelerate draw completely. However, no game has been found yet that does + // this, so this is left unimplemented for now. Revisit this when an issue is found in + // games. 
+ } else { + if (VideoCore::g_hw_shader_accurate_gs) { + accelerate_draw = false; + } + } + + bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed)); + + if (accelerate_draw && + VideoCore::g_renderer->Rasterizer()->AccelerateDrawBatch(is_indexed)) { + if (g_debug_context) { + g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr); + } + break; + } + // Processes information about internal vertex attributes to figure out how a vertex is // loaded. // Later, these can be compiled and cached. @@ -294,15 +326,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { Shader::OutputVertex::ValidateSemantics(regs.rasterizer); // Load vertices - bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed)); - const auto& index_info = regs.pipeline.index_array; const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset); const u16* index_address_16 = reinterpret_cast(index_address_8); bool index_u16 = index_info.format != 0; - PrimitiveAssembler& primitive_assembler = g_state.primitive_assembler; - if (g_debug_context && g_debug_context->recorder) { for (int i = 0; i < 3; ++i) { const auto texture = regs.texturing.GetTextures()[i]; diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp index 52fc84363..b282ccca0 100644 --- a/src/video_core/primitive_assembly.cpp +++ b/src/video_core/primitive_assembly.cpp @@ -71,6 +71,16 @@ void PrimitiveAssembler::Reconfigure(PipelineRegs::TriangleTopology this->topology = topology; } +template +bool PrimitiveAssembler::IsEmpty() const { + return buffer_index == 0 && strip_ready == false; +} + +template +PipelineRegs::TriangleTopology PrimitiveAssembler::GetTopology() const { + return topology; +} + // explicitly instantiate use cases template struct PrimitiveAssembler; diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h index 0d6219292..fd5445aa8 100644 --- 
a/src/video_core/primitive_assembly.h +++ b/src/video_core/primitive_assembly.h @@ -45,6 +45,16 @@ struct PrimitiveAssembler { */ void Reconfigure(PipelineRegs::TriangleTopology topology); + /** + * Returns whether the PrimitiveAssembler has an empty internal buffer. + */ + bool IsEmpty() const; + + /** + * Returns the current topology. + */ + PipelineRegs::TriangleTopology GetTopology() const; + private: PipelineRegs::TriangleTopology topology; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 35a6594ad..d1f09fccc 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -66,5 +66,10 @@ public: ScreenInfo& screen_info) { return false; } + + /// Attempt to draw using hardware shaders + virtual bool AccelerateDrawBatch(bool is_indexed) { + return false; + } }; } // namespace VideoCore diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 5571346f4..ee685c2ed 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -12,6 +12,7 @@ #include "common/logging/log.h" #include "common/math_util.h" #include "common/microprofile.h" +#include "common/scope_exit.h" #include "common/vector_math.h" #include "core/hw/gpu.h" #include "video_core/pica_state.h" @@ -26,13 +27,17 @@ using PixelFormat = SurfaceParams::PixelFormat; using SurfaceType = SurfaceParams::SurfaceType; +MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Array Setup", MP_RGB(255, 128, 0)); +MICROPROFILE_DEFINE(OpenGL_VS, "OpenGL", "Vertex Shader Setup", MP_RGB(192, 128, 128)); +MICROPROFILE_DEFINE(OpenGL_GS, "OpenGL", "Geometry Shader Setup", MP_RGB(128, 192, 128)); MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255)); MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 
255, 100)); RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true), vertex_buffer(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE), - uniform_buffer(GL_UNIFORM_BUFFER, UNIFORM_BUFFER_SIZE) { + uniform_buffer(GL_UNIFORM_BUFFER, UNIFORM_BUFFER_SIZE), + index_buffer(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE) { // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0 state.clip_distance[0] = true; @@ -46,13 +51,9 @@ RasterizerOpenGL::RasterizerOpenGL() texture_cube_sampler.Create(); state.texture_cube_unit.sampler = texture_cube_sampler.sampler.handle; - // Generate VBO, VAO and UBO - vertex_array.Create(); - - state.draw.vertex_array = vertex_array.handle; - state.draw.vertex_buffer = vertex_buffer.GetHandle(); - state.draw.uniform_buffer = uniform_buffer.GetHandle(); - state.Apply(); + // Generate VAO + sw_vao.Create(); + hw_vao.Create(); uniform_block_data.dirty = true; @@ -67,10 +68,18 @@ RasterizerOpenGL::RasterizerOpenGL() uniform_block_data.proctex_diff_lut_dirty = true; glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment); + uniform_size_aligned_vs = + Common::AlignUp(sizeof(VSUniformData), uniform_buffer_alignment); + uniform_size_aligned_gs = + Common::AlignUp(sizeof(GSUniformData), uniform_buffer_alignment); uniform_size_aligned_fs = Common::AlignUp(sizeof(UniformData), uniform_buffer_alignment); - // Set vertex attributes + // Set vertex attributes for software shader path + state.draw.vertex_array = sw_vao.handle; + state.draw.vertex_buffer = vertex_buffer.GetHandle(); + state.Apply(); + glVertexAttribPointer(GLShader::ATTRIBUTE_POSITION, 4, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, position)); glEnableVertexAttribArray(GLShader::ATTRIBUTE_POSITION); @@ -176,6 +185,11 @@ RasterizerOpenGL::RasterizerOpenGL() glActiveTexture(TextureUnits::ProcTexDiffLUT.Enum()); glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, proctex_diff_lut_buffer.handle); + // Bind index buffer for hardware shader path + 
state.draw.vertex_array = hw_vao.handle; + state.Apply(); + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer.GetHandle()); + shader_program_manager = std::make_unique(GLAD_GL_ARB_separate_shader_objects); @@ -258,10 +272,264 @@ void RasterizerOpenGL::AddTriangle(const Pica::Shader::OutputVertex& v0, vertex_batch.emplace_back(v2, AreQuaternionsOpposite(v0.quat, v2.quat)); } +static constexpr std::array vs_attrib_types{ + GL_BYTE, // VertexAttributeFormat::BYTE + GL_UNSIGNED_BYTE, // VertexAttributeFormat::UBYTE + GL_SHORT, // VertexAttributeFormat::SHORT + GL_FLOAT // VertexAttributeFormat::FLOAT +}; + +struct VertexArrayInfo { + u32 vs_input_index_min; + u32 vs_input_index_max; + u32 vs_input_size; +}; + +RasterizerOpenGL::VertexArrayInfo RasterizerOpenGL::AnalyzeVertexArray(bool is_indexed) { + const auto& regs = Pica::g_state.regs; + const auto& vertex_attributes = regs.pipeline.vertex_attributes; + + u32 vertex_min; + u32 vertex_max; + if (is_indexed) { + const auto& index_info = regs.pipeline.index_array; + PAddr address = vertex_attributes.GetPhysicalBaseAddress() + index_info.offset; + const u8* index_address_8 = Memory::GetPhysicalPointer(address); + const u16* index_address_16 = reinterpret_cast(index_address_8); + bool index_u16 = index_info.format != 0; + + vertex_min = 0xFFFF; + vertex_max = 0; + std::size_t size = regs.pipeline.num_vertices * (index_u16 ? 2 : 1); + res_cache.FlushRegion(address, size, nullptr); + for (u32 index = 0; index < regs.pipeline.num_vertices; ++index) { + u32 vertex = index_u16 ? 
index_address_16[index] : index_address_8[index]; + vertex_min = std::min(vertex_min, vertex); + vertex_max = std::max(vertex_max, vertex); + } + } else { + vertex_min = regs.pipeline.vertex_offset; + vertex_max = regs.pipeline.vertex_offset + regs.pipeline.num_vertices - 1; + } + + u32 vertex_num = vertex_max - vertex_min + 1; + u32 vs_input_size = 0; + for (auto& loader : vertex_attributes.attribute_loaders) { + if (loader.component_count != 0) { + vs_input_size += loader.byte_count * vertex_num; + } + } + + return {vertex_min, vertex_max, vs_input_size}; +} + +void RasterizerOpenGL::SetupVertexArray(u8* array_ptr, GLintptr buffer_offset, + GLuint vs_input_index_min, GLuint vs_input_index_max) { + MICROPROFILE_SCOPE(OpenGL_VAO); + const auto& regs = Pica::g_state.regs; + const auto& vertex_attributes = regs.pipeline.vertex_attributes; + PAddr base_address = vertex_attributes.GetPhysicalBaseAddress(); + + state.draw.vertex_array = hw_vao.handle; + state.draw.vertex_buffer = vertex_buffer.GetHandle(); + state.Apply(); + + std::array enable_attributes{}; + + for (const auto& loader : vertex_attributes.attribute_loaders) { + if (loader.component_count == 0 || loader.byte_count == 0) { + continue; + } + + u32 offset = 0; + for (u32 comp = 0; comp < loader.component_count && comp < 12; ++comp) { + u32 attribute_index = loader.GetComponent(comp); + if (attribute_index < 12) { + if (vertex_attributes.GetNumElements(attribute_index) != 0) { + offset = Common::AlignUp( + offset, vertex_attributes.GetElementSizeInBytes(attribute_index)); + + u32 input_reg = regs.vs.GetRegisterForAttribute(attribute_index); + GLint size = vertex_attributes.GetNumElements(attribute_index); + GLenum type = vs_attrib_types[static_cast( + vertex_attributes.GetFormat(attribute_index))]; + GLsizei stride = loader.byte_count; + glVertexAttribPointer(input_reg, size, type, GL_FALSE, stride, + reinterpret_cast(buffer_offset + offset)); + enable_attributes[input_reg] = true; + + offset += 
vertex_attributes.GetStride(attribute_index); + } + } else { + // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, + // respectively + offset = Common::AlignUp(offset, 4); + offset += (attribute_index - 11) * 4; + } + } + + PAddr data_addr = + base_address + loader.data_offset + (vs_input_index_min * loader.byte_count); + + u32 vertex_num = vs_input_index_max - vs_input_index_min + 1; + u32 data_size = loader.byte_count * vertex_num; + + res_cache.FlushRegion(data_addr, data_size, nullptr); + std::memcpy(array_ptr, Memory::GetPhysicalPointer(data_addr), data_size); + + array_ptr += data_size; + buffer_offset += data_size; + } + + for (std::size_t i = 0; i < enable_attributes.size(); ++i) { + if (enable_attributes[i] != hw_vao_enabled_attributes[i]) { + if (enable_attributes[i]) { + glEnableVertexAttribArray(i); + } else { + glDisableVertexAttribArray(i); + } + hw_vao_enabled_attributes[i] = enable_attributes[i]; + } + + if (vertex_attributes.IsDefaultAttribute(i)) { + u32 reg = regs.vs.GetRegisterForAttribute(i); + if (!enable_attributes[reg]) { + const auto& attr = Pica::g_state.input_default_attributes.attr[i]; + glVertexAttrib4f(reg, attr.x.ToFloat32(), attr.y.ToFloat32(), attr.z.ToFloat32(), + attr.w.ToFloat32()); + } + } + } +} + +bool RasterizerOpenGL::SetupVertexShader() { + MICROPROFILE_SCOPE(OpenGL_VS); + GLShader::PicaVSConfig vs_config(Pica::g_state.regs, Pica::g_state.vs); + return shader_program_manager->UseProgrammableVertexShader(vs_config, Pica::g_state.vs); +} + +bool RasterizerOpenGL::SetupGeometryShader() { + MICROPROFILE_SCOPE(OpenGL_GS); + const auto& regs = Pica::g_state.regs; + if (regs.pipeline.use_gs == Pica::PipelineRegs::UseGS::No) { + GLShader::PicaFixedGSConfig gs_config(regs); + shader_program_manager->UseFixedGeometryShader(gs_config); + return true; + } else { + GLShader::PicaGSConfig gs_config(regs, Pica::g_state.gs); + return shader_program_manager->UseProgrammableGeometryShader(gs_config, Pica::g_state.gs); + 
} +} + +bool RasterizerOpenGL::AccelerateDrawBatch(bool is_indexed) { + const auto& regs = Pica::g_state.regs; + if (regs.pipeline.use_gs != Pica::PipelineRegs::UseGS::No) { + if (regs.pipeline.gs_config.mode != Pica::PipelineRegs::GSMode::Point) { + return false; + } + if (regs.pipeline.triangle_topology != Pica::PipelineRegs::TriangleTopology::Shader) { + return false; + } + } + + if (!SetupVertexShader()) + return false; + + if (!SetupGeometryShader()) + return false; + + return Draw(true, is_indexed); +} + +static GLenum GetCurrentPrimitiveMode(bool use_gs) { + const auto& regs = Pica::g_state.regs; + if (use_gs) { + switch ((regs.gs.max_input_attribute_index + 1) / + (regs.pipeline.vs_outmap_total_minus_1_a + 1)) { + case 1: + return GL_POINTS; + case 2: + return GL_LINES; + case 4: + return GL_LINES_ADJACENCY; + case 3: + return GL_TRIANGLES; + case 6: + return GL_TRIANGLES_ADJACENCY; + default: + UNREACHABLE(); + } + } else { + switch (regs.pipeline.triangle_topology) { + case Pica::PipelineRegs::TriangleTopology::Shader: + case Pica::PipelineRegs::TriangleTopology::List: + return GL_TRIANGLES; + case Pica::PipelineRegs::TriangleTopology::Fan: + return GL_TRIANGLE_FAN; + case Pica::PipelineRegs::TriangleTopology::Strip: + return GL_TRIANGLE_STRIP; + default: + UNREACHABLE(); + } + } +} + +bool RasterizerOpenGL::AccelerateDrawBatchInternal(bool is_indexed, bool use_gs) { + const auto& regs = Pica::g_state.regs; + GLenum primitive_mode = GetCurrentPrimitiveMode(use_gs); + + auto [vs_input_index_min, vs_input_index_max, vs_input_size] = AnalyzeVertexArray(is_indexed); + + if (vs_input_size > VERTEX_BUFFER_SIZE) { + NGLOG_WARNING(Render_OpenGL, "Too large vertex input size {}", vs_input_size); + return false; + } + + state.draw.vertex_buffer = vertex_buffer.GetHandle(); + state.Apply(); + + u8* buffer_ptr; + GLintptr buffer_offset; + std::tie(buffer_ptr, buffer_offset, std::ignore) = vertex_buffer.Map(vs_input_size, 4); + SetupVertexArray(buffer_ptr, 
buffer_offset, vs_input_index_min, vs_input_index_max); + vertex_buffer.Unmap(vs_input_size); + + shader_program_manager->ApplyTo(state); + state.Apply(); + + if (is_indexed) { + bool index_u16 = regs.pipeline.index_array.format != 0; + std::size_t index_buffer_size = regs.pipeline.num_vertices * (index_u16 ? 2 : 1); + + if (index_buffer_size > INDEX_BUFFER_SIZE) { + NGLOG_WARNING(Render_OpenGL, "Too large index input size {}", index_buffer_size); + return false; + } + + const u8* index_data = + Memory::GetPhysicalPointer(regs.pipeline.vertex_attributes.GetPhysicalBaseAddress() + + regs.pipeline.index_array.offset); + std::tie(buffer_ptr, buffer_offset, std::ignore) = index_buffer.Map(index_buffer_size, 4); + std::memcpy(buffer_ptr, index_data, index_buffer_size); + index_buffer.Unmap(index_buffer_size); + + glDrawRangeElementsBaseVertex( + primitive_mode, vs_input_index_min, vs_input_index_max, regs.pipeline.num_vertices, + index_u16 ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE, + reinterpret_cast(buffer_offset), -static_cast(vs_input_index_min)); + } else { + glDrawArrays(primitive_mode, 0, regs.pipeline.num_vertices); + } + return true; +} + void RasterizerOpenGL::DrawTriangles() { if (vertex_batch.empty()) return; + Draw(false, false); +} +bool RasterizerOpenGL::Draw(bool accelerate, bool is_indexed) { MICROPROFILE_SCOPE(OpenGL_Drawing); const auto& regs = Pica::g_state.regs; @@ -474,7 +742,8 @@ void RasterizerOpenGL::DrawTriangles() { } // Sync the uniform data - UploadUniforms(); + const bool use_gs = regs.pipeline.use_gs == Pica::PipelineRegs::UseGS::Yes; + UploadUniforms(accelerate, use_gs); // Viewport can have negative offsets or larger // dimensions than our framebuffer sub-rect. 
@@ -487,22 +756,31 @@ void RasterizerOpenGL::DrawTriangles() { state.scissor.height = draw_rect.GetHeight(); state.Apply(); - shader_program_manager->UseTrivialVertexShader(); - shader_program_manager->UseTrivialGeometryShader(); - shader_program_manager->ApplyTo(state); - state.Apply(); - // Draw the vertex batch - size_t max_vertices = 3 * (vertex_buffer.GetSize() / (3 * sizeof(HardwareVertex))); - for (size_t base_vertex = 0; base_vertex < vertex_batch.size(); base_vertex += max_vertices) { - size_t vertices = std::min(max_vertices, vertex_batch.size() - base_vertex); - size_t vertex_size = vertices * sizeof(HardwareVertex); - u8* vbo; - GLintptr offset; - std::tie(vbo, offset, std::ignore) = vertex_buffer.Map(vertex_size, sizeof(HardwareVertex)); - memcpy(vbo, vertex_batch.data() + base_vertex, vertex_size); - vertex_buffer.Unmap(vertex_size); - glDrawArrays(GL_TRIANGLES, offset / sizeof(HardwareVertex), (GLsizei)vertices); + bool succeeded = true; + if (accelerate) { + succeeded = AccelerateDrawBatchInternal(is_indexed, use_gs); + } else { + state.draw.vertex_array = sw_vao.handle; + state.draw.vertex_buffer = vertex_buffer.GetHandle(); + shader_program_manager->UseTrivialVertexShader(); + shader_program_manager->UseTrivialGeometryShader(); + shader_program_manager->ApplyTo(state); + state.Apply(); + + std::size_t max_vertices = 3 * (VERTEX_BUFFER_SIZE / (3 * sizeof(HardwareVertex))); + for (std::size_t base_vertex = 0; base_vertex < vertex_batch.size(); + base_vertex += max_vertices) { + std::size_t vertices = std::min(max_vertices, vertex_batch.size() - base_vertex); + std::size_t vertex_size = vertices * sizeof(HardwareVertex); + u8* vbo; + GLintptr offset; + std::tie(vbo, offset, std::ignore) = + vertex_buffer.Map(vertex_size, sizeof(HardwareVertex)); + std::memcpy(vbo, vertex_batch.data() + base_vertex, vertex_size); + vertex_buffer.Unmap(vertex_size); + glDrawArrays(GL_TRIANGLES, offset / sizeof(HardwareVertex), (GLsizei)vertices); + } } // Disable 
scissor test @@ -532,6 +810,8 @@ void RasterizerOpenGL::DrawTriangles() { res_cache.InvalidateRegion(boost::icl::first(interval), boost::icl::length(interval), depth_surface); } + + return succeeded; } void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) { @@ -1648,18 +1928,53 @@ void RasterizerOpenGL::SyncLightDistanceAttenuationScale(int light_index) { } } -void RasterizerOpenGL::UploadUniforms() { - if (!uniform_block_data.dirty) +void RasterizerOpenGL::UploadUniforms(bool accelerate_draw, bool use_gs) { + // glBindBufferRange below also changes the generic buffer binding point, so we sync the state + // first + state.draw.uniform_buffer = uniform_buffer.GetHandle(); + state.Apply(); + + bool sync_vs = accelerate_draw; + bool sync_gs = accelerate_draw && use_gs; + bool sync_fs = uniform_block_data.dirty; + + if (!sync_vs && !sync_gs && !sync_fs) return; - size_t uniform_size = uniform_size_aligned_fs; + size_t uniform_size = + uniform_size_aligned_vs + uniform_size_aligned_gs + uniform_size_aligned_fs; + size_t used_bytes = 0; u8* uniforms; GLintptr offset; - std::tie(uniforms, offset, std::ignore) = + bool invalidate; + std::tie(uniforms, offset, invalidate) = uniform_buffer.Map(uniform_size, uniform_buffer_alignment); - std::memcpy(uniforms, &uniform_block_data.data, sizeof(UniformData)); - uniform_buffer.Unmap(uniform_size); - glBindBufferRange(GL_UNIFORM_BUFFER, 0, uniform_buffer.GetHandle(), offset, - sizeof(UniformData)); - uniform_block_data.dirty = false; + + if (sync_vs) { + VSUniformData vs_uniforms; + vs_uniforms.uniforms.SetFromRegs(Pica::g_state.regs.vs, Pica::g_state.vs); + std::memcpy(uniforms + used_bytes, &vs_uniforms, sizeof(vs_uniforms)); + glBindBufferRange(GL_UNIFORM_BUFFER, static_cast(UniformBindings::VS), + uniform_buffer.GetHandle(), offset + used_bytes, sizeof(VSUniformData)); + used_bytes += uniform_size_aligned_vs; + } + + if (sync_gs) { + GSUniformData gs_uniforms; + gs_uniforms.uniforms.SetFromRegs(Pica::g_state.regs.gs, 
Pica::g_state.gs); + std::memcpy(uniforms + used_bytes, &gs_uniforms, sizeof(gs_uniforms)); + glBindBufferRange(GL_UNIFORM_BUFFER, static_cast(UniformBindings::GS), + uniform_buffer.GetHandle(), offset + used_bytes, sizeof(GSUniformData)); + used_bytes += uniform_size_aligned_gs; + } + + if (sync_fs || invalidate) { + std::memcpy(uniforms + used_bytes, &uniform_block_data.data, sizeof(UniformData)); + glBindBufferRange(GL_UNIFORM_BUFFER, static_cast(UniformBindings::Common), + uniform_buffer.GetHandle(), offset + used_bytes, sizeof(UniformData)); + uniform_block_data.dirty = false; + used_bytes += uniform_size_aligned_fs; + } + + uniform_buffer.Unmap(used_bytes); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 400bef388..547b73aae 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -50,6 +50,7 @@ public: bool AccelerateFill(const GPU::Regs::MemoryFillConfig& config) override; bool AccelerateDisplay(const GPU::Regs::FramebufferConfig& config, PAddr framebuffer_addr, u32 pixel_stride, ScreenInfo& screen_info) override; + bool AccelerateDrawBatch(bool is_indexed) override; private: struct SamplerInfo { @@ -73,6 +74,7 @@ private: /// Structure that the hardware rendered vertices are composed of struct HardwareVertex { + HardwareVertex() = default; HardwareVertex(const Pica::Shader::OutputVertex& v, bool flip_quaternion) { position[0] = v.pos.x.ToFloat32(); position[1] = v.pos.y.ToFloat32(); @@ -216,7 +218,32 @@ private: void SyncLightDistanceAttenuationScale(int light_index); /// Upload the uniform blocks to the uniform buffer object - void UploadUniforms(); + void UploadUniforms(bool accelerate_draw, bool use_gs); + + /// Generic draw function for DrawTriangles and AccelerateDrawBatch + bool Draw(bool accelerate, bool is_indexed); + + /// Internal implementation for AccelerateDrawBatch + bool AccelerateDrawBatchInternal(bool is_indexed, 
bool use_gs); + + struct VertexArrayInfo { + u32 vs_input_index_min; + u32 vs_input_index_max; + u32 vs_input_size; + }; + + /// Retrieve the range and the size of the input vertex + VertexArrayInfo AnalyzeVertexArray(bool is_indexed); + + /// Setup vertex array for AccelerateDrawBatch + void SetupVertexArray(u8* array_ptr, GLintptr buffer_offset, GLuint vs_input_index_min, + GLuint vs_input_index_max); + + /// Setup vertex shader for AccelerateDrawBatch + bool SetupVertexShader(); + + /// Setup geometry shader for AccelerateDrawBatch + bool SetupGeometryShader(); OpenGLState state; @@ -242,14 +269,21 @@ private: // They shall be big enough for about one frame. static constexpr size_t VERTEX_BUFFER_SIZE = 32 * 1024 * 1024; + static constexpr size_t INDEX_BUFFER_SIZE = 1 * 1024 * 1024; static constexpr size_t UNIFORM_BUFFER_SIZE = 2 * 1024 * 1024; + OGLVertexArray sw_vao; // VAO for software shader draw + OGLVertexArray hw_vao; // VAO for hardware shader / accelerate draw + std::array hw_vao_enabled_attributes{}; + std::array texture_samplers; - OGLVertexArray vertex_array; OGLStreamBuffer vertex_buffer; OGLStreamBuffer uniform_buffer; + OGLStreamBuffer index_buffer; OGLFramebuffer framebuffer; GLint uniform_buffer_alignment; + size_t uniform_size_aligned_vs; + size_t uniform_size_aligned_gs; size_t uniform_size_aligned_fs; SamplerInfo texture_cube_sampler; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index 16f4556c8..17c1b7e92 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -18,6 +18,7 @@ #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_gen.h" #include "video_core/renderer_opengl/gl_shader_util.h" +#include "video_core/video_core.h" using Pica::FramebufferRegs; using Pica::LightingRegs; @@ -226,7 +227,7 @@ void PicaShaderConfigCommon::Init(const Pica::ShaderRegs& regs, 
Pica::Shader::Sh program_hash = setup.GetProgramCodeHash(); swizzle_hash = setup.GetSwizzleDataHash(); main_offset = regs.main_offset; - sanitize_mul = false; // TODO (wwylele): stubbed now. Should sync with user settings + sanitize_mul = VideoCore::g_hw_shader_accurate_mul; num_outputs = 0; output_map.fill(16); diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index e1698443e..1957cfbcc 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -14,16 +14,26 @@ OGLStreamBuffer::OGLStreamBuffer(GLenum target, GLsizeiptr size, bool prefer_coh gl_buffer.Create(); glBindBuffer(gl_target, gl_buffer.handle); + GLsizeiptr allocate_size = size; + if (target == GL_ARRAY_BUFFER) { + // On AMD GPU there is a strange crash in indexed drawing. The crash happens when the buffer + // read position is near the end and is an out-of-bound access to the vertex buffer. This is + // probably a bug in the driver and is related to the usage of vec3 attributes in the + // vertex array. Doubling the allocation size for the vertex buffer seems to avoid the + // crash. + allocate_size *= 2; + } + if (GLAD_GL_ARB_buffer_storage) { persistent = true; coherent = prefer_coherent; GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0); - glBufferStorage(gl_target, buffer_size, nullptr, flags); + glBufferStorage(gl_target, allocate_size, nullptr, flags); mapped_ptr = static_cast(glMapBufferRange( gl_target, 0, buffer_size, flags | (coherent ? 
0 : GL_MAP_FLUSH_EXPLICIT_BIT))); } else { - glBufferData(gl_target, buffer_size, nullptr, GL_STREAM_DRAW); + glBufferData(gl_target, allocate_size, nullptr, GL_STREAM_DRAW); } } diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index f4b530604..9fc29bbad 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -19,7 +19,9 @@ std::unique_ptr g_renderer; ///< Renderer plugin std::atomic g_hw_renderer_enabled; std::atomic g_shader_jit_enabled; -std::atomic g_vsync_enabled; +std::atomic g_hw_shader_enabled; +std::atomic g_hw_shader_accurate_gs; +std::atomic g_hw_shader_accurate_mul; /// Initialize the video core bool Init(EmuWindow* emu_window) { diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h index 69c672f29..7ede71494 100644 --- a/src/video_core/video_core.h +++ b/src/video_core/video_core.h @@ -22,6 +22,9 @@ extern EmuWindow* g_emu_window; ///< Emu window // qt ui) extern std::atomic g_hw_renderer_enabled; extern std::atomic g_shader_jit_enabled; +extern std::atomic g_hw_shader_enabled; +extern std::atomic g_hw_shader_accurate_gs; +extern std::atomic g_hw_shader_accurate_mul; /// Start the video core void Start();