Optimize AttributeBuffer to OutputVertex conversion (#3283)

Optimize AttributeBuffer to OutputVertex conversion

First I unrolled the inner loop, then I pushed semantics validation
outside of the hotloop.

I also added overflow slots to avoid conditional branches.

Super Mario 3D Land's intro runs at almost full speed when compiled with
Clang, and theres a noticible speed increase in MSVC. GCC hasn't been
tested but I'm confident in its ability to optimize this code.
This commit is contained in:
Dwayne Slater
2018-01-02 18:32:33 -05:00
committed by Yuri Kunde Schlesner
parent 3f7f2b42c0
commit 41929371dc
4 changed files with 34 additions and 18 deletions

View File

@@ -221,6 +221,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
MICROPROFILE_SCOPE(GPU_Drawing);
immediate_attribute_id = 0;
Shader::OutputVertex::ValidateSemantics(regs.rasterizer);
auto* shader_engine = Shader::GetEngine();
shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
@@ -289,6 +291,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
// Later, these can be compiled and cached.
const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
VertexLoader loader(regs.pipeline);
Shader::OutputVertex::ValidateSemantics(regs.rasterizer);
// Load vertices
bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));