71#define DEBUG_TYPE "si-load-store-opt"
79 S_BUFFER_LOAD_SGPR_IMM,
98 unsigned char NumVAddrs = 0;
101 bool SOffset =
false;
109const unsigned MaxAddressRegs = 12 + 1 + 1;
111class SILoadStoreOptimizer {
120 InstClassEnum InstClass;
124 int AddrIdx[MaxAddressRegs];
126 unsigned NumAddresses;
129 bool hasSameBaseAddress(
const CombineInfo &CI) {
130 if (NumAddresses != CI.NumAddresses)
134 for (
unsigned i = 0; i < NumAddresses; i++) {
137 if (AddrReg[i]->isImm() || AddrRegNext.
isImm()) {
138 if (AddrReg[i]->isImm() != AddrRegNext.
isImm() ||
156 for (
unsigned i = 0; i < NumAddresses; ++i) {
165 if (!AddrOp->
isReg())
171 AddrOp->
getReg() != AMDGPU::SGPR_NULL)
190 struct BaseRegisters {
194 unsigned LoSubReg = 0;
195 unsigned HiSubReg = 0;
197 bool UseV64Pattern =
false;
219 static bool dmasksCanBeCombined(
const CombineInfo &CI,
221 const CombineInfo &Paired);
222 static bool offsetsCanBeCombined(CombineInfo &CI,
const GCNSubtarget &STI,
223 CombineInfo &Paired,
bool Modify =
false);
224 static bool widthsFit(
const GCNSubtarget &STI,
const CombineInfo &CI,
225 const CombineInfo &Paired);
226 unsigned getNewOpcode(
const CombineInfo &CI,
const CombineInfo &Paired);
227 static std::pair<unsigned, unsigned> getSubRegIdxs(
const CombineInfo &CI,
228 const CombineInfo &Paired);
230 getTargetRegisterClass(
const CombineInfo &CI,
231 const CombineInfo &Paired)
const;
234 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
236 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
240 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
244 unsigned read2Opcode(
unsigned EltSize)
const;
245 unsigned read2ST64Opcode(
unsigned EltSize)
const;
247 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
250 unsigned write2Opcode(
unsigned EltSize)
const;
251 unsigned write2ST64Opcode(
unsigned EltSize)
const;
252 unsigned getWrite2Opcode(
const CombineInfo &CI)
const;
255 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
258 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
261 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
264 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
267 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
270 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
273 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
276 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
279 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
283 int32_t NewOffset)
const;
284 void updateAsyncLDSAddress(
MachineInstr &
MI, int32_t OffsetDiff)
const;
289 MemAddress &Addr)
const;
298 std::list<std::list<CombineInfo> > &MergeableInsts)
const;
303 std::list<std::list<CombineInfo>> &MergeableInsts)
const;
306 const CombineInfo &Paired);
308 static InstClassEnum getCommonInstClass(
const CombineInfo &CI,
309 const CombineInfo &Paired);
311 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
312 bool &OptimizeListAgain);
313 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
328 StringRef getPassName()
const override {
return "SI Load Store Optimizer"; }
343 const unsigned Opc =
MI.getOpcode();
349 if (
TII.isImage(
MI)) {
351 TII.getNamedOperand(
MI, AMDGPU::OpName::dmask)->getImm();
359 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
360 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
361 case AMDGPU::S_LOAD_DWORD_IMM:
362 case AMDGPU::GLOBAL_LOAD_DWORD:
363 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
364 case AMDGPU::GLOBAL_STORE_DWORD:
365 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
366 case AMDGPU::FLAT_LOAD_DWORD:
367 case AMDGPU::FLAT_STORE_DWORD:
368 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
369 case AMDGPU::FLAT_STORE_DWORD_SADDR:
371 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
372 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
373 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
374 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
375 case AMDGPU::S_LOAD_DWORDX2_IMM:
376 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
377 case AMDGPU::GLOBAL_LOAD_DWORDX2:
378 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
379 case AMDGPU::GLOBAL_STORE_DWORDX2:
380 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
381 case AMDGPU::FLAT_LOAD_DWORDX2:
382 case AMDGPU::FLAT_STORE_DWORDX2:
383 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
384 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
386 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
387 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
388 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
389 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
390 case AMDGPU::S_LOAD_DWORDX3_IMM:
391 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
392 case AMDGPU::GLOBAL_LOAD_DWORDX3:
393 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
394 case AMDGPU::GLOBAL_STORE_DWORDX3:
395 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
396 case AMDGPU::FLAT_LOAD_DWORDX3:
397 case AMDGPU::FLAT_STORE_DWORDX3:
398 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
399 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
401 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
402 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
403 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
404 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
405 case AMDGPU::S_LOAD_DWORDX4_IMM:
406 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
407 case AMDGPU::GLOBAL_LOAD_DWORDX4:
408 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
409 case AMDGPU::GLOBAL_STORE_DWORDX4:
410 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
411 case AMDGPU::FLAT_LOAD_DWORDX4:
412 case AMDGPU::FLAT_STORE_DWORDX4:
413 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
414 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
416 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
417 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
418 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
419 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
420 case AMDGPU::S_LOAD_DWORDX8_IMM:
421 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
423 case AMDGPU::DS_READ_B32:
424 case AMDGPU::DS_READ_B32_gfx9:
425 case AMDGPU::DS_WRITE_B32:
426 case AMDGPU::DS_WRITE_B32_gfx9:
428 case AMDGPU::DS_READ_B64:
429 case AMDGPU::DS_READ_B64_gfx9:
430 case AMDGPU::DS_WRITE_B64:
431 case AMDGPU::DS_WRITE_B64_gfx9:
446 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
447 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
448 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
449 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
450 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
451 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
452 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
453 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
454 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
455 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
456 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
457 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
458 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
459 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
460 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
461 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
463 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
464 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
465 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
466 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
467 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
468 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
469 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
470 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
471 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
472 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
473 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
474 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
475 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
476 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
477 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
478 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
491 if (
TII.get(
Opc).mayStore() || !
TII.get(
Opc).mayLoad() ||
500 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
501 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
502 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
503 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
504 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
505 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
506 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
507 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
508 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
509 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
510 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
511 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
512 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
513 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
514 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
515 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
517 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
518 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
519 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
520 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
521 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
522 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
523 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
524 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
525 return TBUFFER_STORE;
529 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
530 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
531 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
532 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
533 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
534 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
535 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
536 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
537 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
538 return S_BUFFER_LOAD_IMM;
539 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
540 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
541 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
542 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
543 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
544 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
545 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
546 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
547 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
548 return S_BUFFER_LOAD_SGPR_IMM;
549 case AMDGPU::S_LOAD_DWORD_IMM:
550 case AMDGPU::S_LOAD_DWORDX2_IMM:
551 case AMDGPU::S_LOAD_DWORDX3_IMM:
552 case AMDGPU::S_LOAD_DWORDX4_IMM:
553 case AMDGPU::S_LOAD_DWORDX8_IMM:
554 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
555 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
556 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
557 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
559 case AMDGPU::DS_READ_B32:
560 case AMDGPU::DS_READ_B32_gfx9:
561 case AMDGPU::DS_READ_B64:
562 case AMDGPU::DS_READ_B64_gfx9:
564 case AMDGPU::DS_WRITE_B32:
565 case AMDGPU::DS_WRITE_B32_gfx9:
566 case AMDGPU::DS_WRITE_B64:
567 case AMDGPU::DS_WRITE_B64_gfx9:
569 case AMDGPU::GLOBAL_LOAD_DWORD:
570 case AMDGPU::GLOBAL_LOAD_DWORDX2:
571 case AMDGPU::GLOBAL_LOAD_DWORDX3:
572 case AMDGPU::GLOBAL_LOAD_DWORDX4:
573 case AMDGPU::FLAT_LOAD_DWORD:
574 case AMDGPU::FLAT_LOAD_DWORDX2:
575 case AMDGPU::FLAT_LOAD_DWORDX3:
576 case AMDGPU::FLAT_LOAD_DWORDX4:
578 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
579 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
580 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
581 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
582 return GLOBAL_LOAD_SADDR;
583 case AMDGPU::GLOBAL_STORE_DWORD:
584 case AMDGPU::GLOBAL_STORE_DWORDX2:
585 case AMDGPU::GLOBAL_STORE_DWORDX3:
586 case AMDGPU::GLOBAL_STORE_DWORDX4:
587 case AMDGPU::FLAT_STORE_DWORD:
588 case AMDGPU::FLAT_STORE_DWORDX2:
589 case AMDGPU::FLAT_STORE_DWORDX3:
590 case AMDGPU::FLAT_STORE_DWORDX4:
592 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
593 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
594 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
595 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
596 return GLOBAL_STORE_SADDR;
597 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
598 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
599 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
600 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
601 return FLAT_LOAD_SADDR;
602 case AMDGPU::FLAT_STORE_DWORD_SADDR:
603 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
604 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
605 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
606 return FLAT_STORE_SADDR;
621 return Info->BaseOpcode;
626 case AMDGPU::DS_READ_B32:
627 case AMDGPU::DS_READ_B32_gfx9:
628 case AMDGPU::DS_READ_B64:
629 case AMDGPU::DS_READ_B64_gfx9:
630 case AMDGPU::DS_WRITE_B32:
631 case AMDGPU::DS_WRITE_B32_gfx9:
632 case AMDGPU::DS_WRITE_B64:
633 case AMDGPU::DS_WRITE_B64_gfx9:
635 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
636 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
637 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
638 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
639 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
640 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
641 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
642 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
643 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
644 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
645 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
646 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
647 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
648 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
649 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
650 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
651 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
652 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
653 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
654 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
655 case AMDGPU::S_LOAD_DWORD_IMM:
656 case AMDGPU::S_LOAD_DWORDX2_IMM:
657 case AMDGPU::S_LOAD_DWORDX3_IMM:
658 case AMDGPU::S_LOAD_DWORDX4_IMM:
659 case AMDGPU::S_LOAD_DWORDX8_IMM:
660 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
661 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
662 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
663 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
664 return AMDGPU::S_LOAD_DWORD_IMM;
665 case AMDGPU::GLOBAL_LOAD_DWORD:
666 case AMDGPU::GLOBAL_LOAD_DWORDX2:
667 case AMDGPU::GLOBAL_LOAD_DWORDX3:
668 case AMDGPU::GLOBAL_LOAD_DWORDX4:
669 case AMDGPU::FLAT_LOAD_DWORD:
670 case AMDGPU::FLAT_LOAD_DWORDX2:
671 case AMDGPU::FLAT_LOAD_DWORDX3:
672 case AMDGPU::FLAT_LOAD_DWORDX4:
673 return AMDGPU::FLAT_LOAD_DWORD;
674 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
675 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
676 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
677 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
678 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
679 case AMDGPU::GLOBAL_STORE_DWORD:
680 case AMDGPU::GLOBAL_STORE_DWORDX2:
681 case AMDGPU::GLOBAL_STORE_DWORDX3:
682 case AMDGPU::GLOBAL_STORE_DWORDX4:
683 case AMDGPU::FLAT_STORE_DWORD:
684 case AMDGPU::FLAT_STORE_DWORDX2:
685 case AMDGPU::FLAT_STORE_DWORDX3:
686 case AMDGPU::FLAT_STORE_DWORDX4:
687 return AMDGPU::FLAT_STORE_DWORD;
688 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
689 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
690 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
691 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
692 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
693 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
694 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
695 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
696 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
697 return AMDGPU::FLAT_LOAD_DWORD_SADDR;
698 case AMDGPU::FLAT_STORE_DWORD_SADDR:
699 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
700 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
701 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
702 return AMDGPU::FLAT_STORE_DWORD_SADDR;
713SILoadStoreOptimizer::getCommonInstClass(
const CombineInfo &CI,
714 const CombineInfo &Paired) {
715 assert(CI.InstClass == Paired.InstClass);
717 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
719 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
733 Result.SOffset =
true;
739 int VAddr0Idx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::vaddr0);
740 if (VAddr0Idx >= 0) {
741 AMDGPU::OpName RsrcName =
742 TII.isMIMG(
Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
743 int RsrcIdx = AMDGPU::getNamedOperandIdx(
Opc, RsrcName);
744 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
761 Result.SOffset =
true;
769 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
770 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
771 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
772 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
773 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
774 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
775 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
776 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
777 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
778 Result.SOffset =
true;
780 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
781 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
782 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
783 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
784 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
785 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
786 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
787 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
788 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
789 case AMDGPU::S_LOAD_DWORD_IMM:
790 case AMDGPU::S_LOAD_DWORDX2_IMM:
791 case AMDGPU::S_LOAD_DWORDX3_IMM:
792 case AMDGPU::S_LOAD_DWORDX4_IMM:
793 case AMDGPU::S_LOAD_DWORDX8_IMM:
794 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
795 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
796 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
797 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
800 case AMDGPU::DS_READ_B32:
801 case AMDGPU::DS_READ_B64:
802 case AMDGPU::DS_READ_B32_gfx9:
803 case AMDGPU::DS_READ_B64_gfx9:
804 case AMDGPU::DS_WRITE_B32:
805 case AMDGPU::DS_WRITE_B64:
806 case AMDGPU::DS_WRITE_B32_gfx9:
807 case AMDGPU::DS_WRITE_B64_gfx9:
810 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
811 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
812 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
813 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
814 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
815 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
816 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
817 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
818 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
819 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
820 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
821 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
822 case AMDGPU::FLAT_STORE_DWORD_SADDR:
823 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
824 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
825 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
828 case AMDGPU::GLOBAL_LOAD_DWORD:
829 case AMDGPU::GLOBAL_LOAD_DWORDX2:
830 case AMDGPU::GLOBAL_LOAD_DWORDX3:
831 case AMDGPU::GLOBAL_LOAD_DWORDX4:
832 case AMDGPU::GLOBAL_STORE_DWORD:
833 case AMDGPU::GLOBAL_STORE_DWORDX2:
834 case AMDGPU::GLOBAL_STORE_DWORDX3:
835 case AMDGPU::GLOBAL_STORE_DWORDX4:
836 case AMDGPU::FLAT_LOAD_DWORD:
837 case AMDGPU::FLAT_LOAD_DWORDX2:
838 case AMDGPU::FLAT_LOAD_DWORDX3:
839 case AMDGPU::FLAT_LOAD_DWORDX4:
840 case AMDGPU::FLAT_STORE_DWORD:
841 case AMDGPU::FLAT_STORE_DWORDX2:
842 case AMDGPU::FLAT_STORE_DWORDX3:
843 case AMDGPU::FLAT_STORE_DWORDX4:
850 const SILoadStoreOptimizer &LSO) {
852 unsigned Opc =
MI->getOpcode();
853 InstClass = getInstClass(
Opc, *LSO.TII);
855 if (InstClass == UNKNOWN)
858 DataRC = LSO.getDataRegClass(*
MI);
863 (
Opc == AMDGPU::DS_READ_B64 ||
Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
868 (
Opc == AMDGPU::DS_WRITE_B64 ||
Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
871 case S_BUFFER_LOAD_IMM:
872 case S_BUFFER_LOAD_SGPR_IMM:
881 if (InstClass == MIMG) {
886 int OffsetIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::offset);
887 Offset =
I->getOperand(OffsetIdx).getImm();
890 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
894 EltSize = Info->BitsPerComp / 8;
897 Width = getOpcodeWidth(*
I, *LSO.TII);
899 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
901 }
else if (InstClass != MIMG) {
905 AddressRegs Regs = getRegs(
Opc, *LSO.TII);
909 for (
unsigned J = 0; J < Regs.NumVAddrs; J++)
910 AddrIdx[NumAddresses++] =
911 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::vaddr0) + J;
913 AddrIdx[NumAddresses++] =
914 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::addr);
916 AddrIdx[NumAddresses++] =
917 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::sbase);
919 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
920 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
922 AddrIdx[NumAddresses++] =
923 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::soffset);
925 AddrIdx[NumAddresses++] =
926 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::saddr);
928 AddrIdx[NumAddresses++] =
929 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::vaddr);
931 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
932 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
933 assert(NumAddresses <= MaxAddressRegs);
935 for (
unsigned J = 0; J < NumAddresses; J++)
936 AddrReg[J] = &
I->getOperand(AddrIdx[J]);
942 "SI Load Store Optimizer",
false,
false)
947char SILoadStoreOptimizerLegacy::
ID = 0;
952 return new SILoadStoreOptimizerLegacy();
958 for (
const auto &
Op :
MI.operands()) {
968bool SILoadStoreOptimizer::canSwapInstructions(
969 const DenseSet<Register> &ARegDefs,
const DenseSet<Register> &ARegUses,
970 const MachineInstr &
A,
const MachineInstr &
B)
const {
971 if (
A.mayLoadOrStore() &&
B.mayLoadOrStore() &&
972 (
A.mayStore() ||
B.mayStore()) &&
A.mayAlias(AA,
B,
true))
974 for (
const auto &BOp :
B.operands()) {
977 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.
contains(BOp.getReg()))
979 if (BOp.isDef() && ARegUses.
contains(BOp.getReg()))
988SILoadStoreOptimizer::combineKnownAdjacentMMOs(
const CombineInfo &CI,
989 const CombineInfo &Paired) {
990 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
991 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1005 MachineFunction *MF = CI.I->getMF();
1009bool SILoadStoreOptimizer::dmasksCanBeCombined(
const CombineInfo &CI,
1010 const SIInstrInfo &
TII,
1011 const CombineInfo &Paired) {
1012 assert(CI.InstClass == MIMG);
1015 const auto *TFEOp =
TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
1016 const auto *LWEOp =
TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
1018 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
1022 AMDGPU::OpName OperandsToMatch[] = {
1023 AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
1024 AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};
1026 for (AMDGPU::OpName
op : OperandsToMatch) {
1027 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
op);
1028 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(),
op) != Idx)
1031 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
1036 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
1037 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
1043 if ((1u << AllowedBitsForMin) <= MinMask)
1050 unsigned ComponentCount,
1052 if (ComponentCount > 4)
1071 return NewFormatInfo->
Format;
1084bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1085 const GCNSubtarget &STI,
1086 CombineInfo &Paired,
1088 assert(CI.InstClass != MIMG);
1092 if (CI.Offset == Paired.Offset)
1096 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1099 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1101 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1103 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1115 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1116 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1117 NumCombinedComponents = 4;
1125 unsigned ElemIndex0 = CI.Offset / CI.EltSize;
1126 unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
1127 if (ElemIndex0 + CI.Width != ElemIndex1 &&
1128 ElemIndex1 + Paired.Width != ElemIndex0)
1134 unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
1135 unsigned RequiredAlign = std::min(MergedBytes, 4u);
1136 unsigned MinOff = std::min(CI.Offset, Paired.Offset);
1137 if (MinOff % RequiredAlign != 0)
1143 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1144 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1149 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1150 if (EltOffset0 + CI.Width != EltOffset1 &&
1151 EltOffset1 + Paired.Width != EltOffset0)
1157 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1158 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1164 if (CI.Width != Paired.Width &&
1165 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1173 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1176 CI.Offset = EltOffset0 / 64;
1177 Paired.Offset = EltOffset1 / 64;
1186 CI.Offset = EltOffset0;
1187 Paired.Offset = EltOffset1;
1193 uint32_t Min = std::min(EltOffset0, EltOffset1);
1194 uint32_t
Max = std::max(EltOffset0, EltOffset1);
1197 if (((Max - Min) & ~Mask) == 0) {
1206 CI.BaseOff = BaseOff * CI.EltSize;
1207 CI.Offset = (EltOffset0 - BaseOff) / 64;
1208 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1220 CI.BaseOff = BaseOff * CI.EltSize;
1221 CI.Offset = EltOffset0 - BaseOff;
1222 Paired.Offset = EltOffset1 - BaseOff;
1230bool SILoadStoreOptimizer::widthsFit(
const GCNSubtarget &STM,
1231 const CombineInfo &CI,
1232 const CombineInfo &Paired) {
1233 const unsigned Width = (CI.Width + Paired.Width);
1234 switch (CI.InstClass) {
1237 case S_BUFFER_LOAD_IMM:
1238 case S_BUFFER_LOAD_SGPR_IMM:
1248 return STM.hasScalarDwordx3Loads();
1253const TargetRegisterClass *
1254SILoadStoreOptimizer::getDataRegClass(
const MachineInstr &
MI)
const {
1255 if (
const auto *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)) {
1256 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1258 if (
const auto *Src =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdata)) {
1259 return TRI->getRegClassForReg(*MRI, Src->getReg());
1261 if (
const auto *Src =
TII->getNamedOperand(
MI, AMDGPU::OpName::data0)) {
1262 return TRI->getRegClassForReg(*MRI, Src->getReg());
1264 if (
const auto *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::sdst)) {
1265 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1267 if (
const auto *Src =
TII->getNamedOperand(
MI, AMDGPU::OpName::sdata)) {
1268 return TRI->getRegClassForReg(*MRI, Src->getReg());
1275SILoadStoreOptimizer::CombineInfo *
1276SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1277 CombineInfo &Paired) {
1280 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1282 assert(CI.InstClass == Paired.InstClass);
1284 if (getInstSubclass(CI.I->getOpcode(), *
TII) !=
1285 getInstSubclass(Paired.I->getOpcode(), *
TII))
1290 if (CI.InstClass == MIMG) {
1291 if (!dmasksCanBeCombined(CI, *
TII, Paired))
1294 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1298 DenseSet<Register> RegDefs;
1299 DenseSet<Register> RegUses;
1301 if (CI.I->mayLoad()) {
1305 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *
MBBI))
1313 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *
MBBI))
1323 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1324 offsetsCanBeCombined(CI, *STM, Paired,
true);
1326 if (CI.InstClass == DS_WRITE) {
1334 const MachineOperand *Data0 =
1335 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1336 const MachineOperand *Data1 =
1337 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1339 const MCInstrDesc &Write2Opc =
TII->get(getWrite2Opcode(CI));
1340 int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.
getOpcode(),
1341 AMDGPU::OpName::data0);
1342 int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.
getOpcode(),
1343 AMDGPU::OpName::data1);
1345 const TargetRegisterClass *DataRC0 =
TII->getRegClass(Write2Opc, Data0Idx);
1347 const TargetRegisterClass *DataRC1 =
TII->getRegClass(Write2Opc, Data1Idx);
1349 if (
unsigned SubReg = Data0->
getSubReg()) {
1354 if (
unsigned SubReg = Data1->
getSubReg()) {
1372void SILoadStoreOptimizer::copyToDestRegs(
1373 CombineInfo &CI, CombineInfo &Paired,
1375 AMDGPU::OpName OpName,
Register DestReg)
const {
1376 MachineBasicBlock *
MBB = CI.I->getParent();
1378 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1381 const MCInstrDesc &CopyDesc =
TII->get(TargetOpcode::COPY);
1382 auto *Dest0 =
TII->getNamedOperand(*CI.I, OpName);
1383 auto *Dest1 =
TII->getNamedOperand(*Paired.I, OpName);
1388 Dest0->setIsEarlyClobber(
false);
1389 Dest1->setIsEarlyClobber(
false);
1393 .
addReg(DestReg, {}, SubRegIdx0);
1396 .
addReg(DestReg, RegState::Kill, SubRegIdx1);
1402SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1405 AMDGPU::OpName OpName)
const {
1406 MachineBasicBlock *
MBB = CI.I->getParent();
1408 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1411 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1414 const auto *Src0 =
TII->getNamedOperand(*CI.I, OpName);
1415 const auto *Src1 =
TII->getNamedOperand(*Paired.I, OpName);
1417 BuildMI(*
MBB, InsertBefore,
DL,
TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1426unsigned SILoadStoreOptimizer::read2Opcode(
unsigned EltSize)
const {
1428 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1429 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1432unsigned SILoadStoreOptimizer::read2ST64Opcode(
unsigned EltSize)
const {
1434 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1436 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1437 : AMDGPU::DS_READ2ST64_B64_gfx9;
1441SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1443 MachineBasicBlock *
MBB = CI.I->getParent();
1447 const auto *AddrReg =
TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1449 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1450 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1452 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1455 (NewOffset0 != NewOffset1) &&
"Computed offset doesn't fit");
1457 const MCInstrDesc &Read2Desc =
TII->get(
Opc);
1459 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1466 unsigned BaseSubReg = AddrReg->getSubReg();
1474 BaseRegFlags = RegState::Kill;
1476 TII->getAddNoCarry(*
MBB, InsertBefore,
DL, BaseReg)
1478 .addReg(AddrReg->getReg(), {}, BaseSubReg)
1483 MachineInstrBuilder Read2 =
1485 .
addReg(BaseReg, BaseRegFlags, BaseSubReg)
1491 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdst, DestReg);
1493 CI.I->eraseFromParent();
1494 Paired.I->eraseFromParent();
1500unsigned SILoadStoreOptimizer::write2Opcode(
unsigned EltSize)
const {
1502 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1503 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1504 : AMDGPU::DS_WRITE2_B64_gfx9;
1507unsigned SILoadStoreOptimizer::write2ST64Opcode(
unsigned EltSize)
const {
1509 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1510 : AMDGPU::DS_WRITE2ST64_B64;
1512 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1513 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1516unsigned SILoadStoreOptimizer::getWrite2Opcode(
const CombineInfo &CI)
const {
1517 return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1521 CombineInfo &CI, CombineInfo &Paired,
1523 MachineBasicBlock *
MBB = CI.I->getParent();
1527 const MachineOperand *AddrReg =
1528 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1529 const MachineOperand *Data0 =
1530 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1531 const MachineOperand *Data1 =
1532 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1534 unsigned NewOffset0 = CI.Offset;
1535 unsigned NewOffset1 = Paired.Offset;
1536 unsigned Opc = getWrite2Opcode(CI);
1538 if (NewOffset0 > NewOffset1) {
1545 (NewOffset0 != NewOffset1) &&
"Computed offset doesn't fit");
1547 const MCInstrDesc &Write2Desc =
TII->get(
Opc);
1552 unsigned BaseSubReg = AddrReg->
getSubReg();
1560 BaseRegFlags = RegState::Kill;
1562 TII->getAddNoCarry(*
MBB, InsertBefore,
DL, BaseReg)
1564 .addReg(AddrReg->
getReg(), {}, BaseSubReg)
1569 MachineInstrBuilder Write2 =
1571 .
addReg(BaseReg, BaseRegFlags, BaseSubReg)
1579 CI.I->eraseFromParent();
1580 Paired.I->eraseFromParent();
1582 LLVM_DEBUG(
dbgs() <<
"Inserted write2 inst: " << *Write2 <<
'\n');
1587SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1589 MachineBasicBlock *
MBB = CI.I->getParent();
1593 const unsigned Opcode = getNewOpcode(CI, Paired);
1595 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1598 unsigned MergedDMask = CI.DMask | Paired.DMask;
1600 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1602 auto MIB =
BuildMI(*
MBB, InsertBefore,
DL,
TII->get(Opcode), DestReg);
1603 for (
unsigned I = 1,
E = (*CI.I).getNumOperands();
I !=
E; ++
I) {
1605 MIB.addImm(MergedDMask);
1607 MIB.add((*CI.I).getOperand(
I));
1613 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1615 MachineInstr *
New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1617 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata, DestReg);
1619 CI.I->eraseFromParent();
1620 Paired.I->eraseFromParent();
1625 CombineInfo &CI, CombineInfo &Paired,
1627 MachineBasicBlock *
MBB = CI.I->getParent();
1631 const unsigned Opcode = getNewOpcode(CI, Paired);
1633 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1636 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1641 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1643 MachineInstrBuilder
New =
1645 .
add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1646 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1647 New.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1648 New.addImm(MergedOffset);
1649 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1651 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::sdst, DestReg);
1653 CI.I->eraseFromParent();
1654 Paired.I->eraseFromParent();
1659 CombineInfo &CI, CombineInfo &Paired,
1661 MachineBasicBlock *
MBB = CI.I->getParent();
1666 const unsigned Opcode = getNewOpcode(CI, Paired);
1668 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1672 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1674 auto MIB =
BuildMI(*
MBB, InsertBefore,
DL,
TII->get(Opcode), DestReg);
1676 AddressRegs Regs = getRegs(Opcode, *
TII);
1679 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1684 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1687 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1688 .add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1689 .addImm(MergedOffset)
1692 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1694 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata, DestReg);
1696 CI.I->eraseFromParent();
1697 Paired.I->eraseFromParent();
1702 CombineInfo &CI, CombineInfo &Paired,
1704 MachineBasicBlock *
MBB = CI.I->getParent();
1709 const unsigned Opcode = getNewOpcode(CI, Paired);
1711 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1715 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1717 auto MIB =
BuildMI(*
MBB, InsertBefore,
DL,
TII->get(Opcode), DestReg);
1719 AddressRegs Regs = getRegs(Opcode, *
TII);
1722 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1727 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1728 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1729 NumCombinedComponents = 4;
1730 unsigned JoinedFormat =
1736 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1739 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1740 .add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1741 .addImm(MergedOffset)
1742 .addImm(JoinedFormat)
1745 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1747 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata, DestReg);
1749 CI.I->eraseFromParent();
1750 Paired.I->eraseFromParent();
1755 CombineInfo &CI, CombineInfo &Paired,
1757 MachineBasicBlock *
MBB = CI.I->getParent();
1761 const unsigned Opcode = getNewOpcode(CI, Paired);
1764 copyFromSrcRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata);
1767 .
addReg(SrcReg, RegState::Kill);
1769 AddressRegs Regs = getRegs(Opcode, *
TII);
1772 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1777 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1778 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1779 NumCombinedComponents = 4;
1780 unsigned JoinedFormat =
1786 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1789 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1790 .add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1791 .addImm(std::min(CI.Offset, Paired.Offset))
1792 .addImm(JoinedFormat)
1795 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1797 CI.I->eraseFromParent();
1798 Paired.I->eraseFromParent();
1803 CombineInfo &CI, CombineInfo &Paired,
1805 MachineBasicBlock *
MBB = CI.I->getParent();
1810 const unsigned Opcode = getNewOpcode(CI, Paired);
1812 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1815 auto MIB =
BuildMI(*
MBB, InsertBefore,
DL,
TII->get(Opcode), DestReg);
1817 if (
auto *SAddr =
TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1821 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1822 .addImm(std::min(CI.Offset, Paired.Offset))
1824 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1826 copyToDestRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdst, DestReg);
1828 CI.I->eraseFromParent();
1829 Paired.I->eraseFromParent();
1834 CombineInfo &CI, CombineInfo &Paired,
1836 MachineBasicBlock *
MBB = CI.I->getParent();
1841 const unsigned Opcode = getNewOpcode(CI, Paired);
1844 copyFromSrcRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata);
1847 .
add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1848 .
addReg(SrcReg, RegState::Kill);
1850 if (
auto *SAddr =
TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1854 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1856 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1858 CI.I->eraseFromParent();
1859 Paired.I->eraseFromParent();
1868 (MMOs.
size() != 1 || MMOs[0]->
getAlign().value() < Width * 4);
1871unsigned SILoadStoreOptimizer::getNewOpcode(
const CombineInfo &CI,
1872 const CombineInfo &Paired) {
1873 const unsigned Width = CI.Width + Paired.Width;
1875 switch (getCommonInstClass(CI, Paired)) {
1877 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1888 case S_BUFFER_LOAD_IMM: {
1891 bool NeedsConstrainedOpc =
1897 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1898 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1900 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1901 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1903 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1904 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1906 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1907 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1910 case S_BUFFER_LOAD_SGPR_IMM: {
1913 bool NeedsConstrainedOpc =
1919 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1920 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1922 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1923 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1925 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1926 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1928 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1929 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1935 bool NeedsConstrainedOpc =
1941 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1942 : AMDGPU::S_LOAD_DWORDX2_IMM;
1944 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1945 : AMDGPU::S_LOAD_DWORDX3_IMM;
1947 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1948 : AMDGPU::S_LOAD_DWORDX4_IMM;
1950 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1951 : AMDGPU::S_LOAD_DWORDX8_IMM;
1959 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1961 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1963 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1965 case GLOBAL_LOAD_SADDR:
1970 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1972 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1974 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1981 return AMDGPU::GLOBAL_STORE_DWORDX2;
1983 return AMDGPU::GLOBAL_STORE_DWORDX3;
1985 return AMDGPU::GLOBAL_STORE_DWORDX4;
1987 case GLOBAL_STORE_SADDR:
1992 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1994 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1996 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
2003 return AMDGPU::FLAT_LOAD_DWORDX2;
2005 return AMDGPU::FLAT_LOAD_DWORDX3;
2007 return AMDGPU::FLAT_LOAD_DWORDX4;
2014 return AMDGPU::FLAT_STORE_DWORDX2;
2016 return AMDGPU::FLAT_STORE_DWORDX3;
2018 return AMDGPU::FLAT_STORE_DWORDX4;
2020 case FLAT_LOAD_SADDR:
2025 return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
2027 return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
2029 return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
2031 case FLAT_STORE_SADDR:
2036 return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
2038 return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
2040 return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
2049std::pair<unsigned, unsigned>
2050SILoadStoreOptimizer::getSubRegIdxs(
const CombineInfo &CI,
2051 const CombineInfo &Paired) {
2052 assert((CI.InstClass != MIMG ||
2054 CI.Width + Paired.Width)) &&
2060 static const unsigned Idxs[5][4] = {
2061 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
2062 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
2063 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
2064 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
2065 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
2068 assert(CI.Width >= 1 && CI.Width <= 4);
2069 assert(Paired.Width >= 1 && Paired.Width <= 4);
2072 Idx1 = Idxs[0][Paired.Width - 1];
2073 Idx0 = Idxs[Paired.Width][CI.Width - 1];
2075 Idx0 = Idxs[0][CI.Width - 1];
2076 Idx1 = Idxs[CI.Width][Paired.Width - 1];
2079 return {Idx0, Idx1};
2082const TargetRegisterClass *
2083SILoadStoreOptimizer::getTargetRegisterClass(
const CombineInfo &CI,
2084 const CombineInfo &Paired)
const {
2085 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
2086 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
2087 switch (CI.Width + Paired.Width) {
2091 return &AMDGPU::SReg_64_XEXECRegClass;
2093 return &AMDGPU::SGPR_96RegClass;
2095 return &AMDGPU::SGPR_128RegClass;
2097 return &AMDGPU::SGPR_256RegClass;
2099 return &AMDGPU::SGPR_512RegClass;
2105 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
2106 return TRI->isAGPRClass(getDataRegClass(*CI.I))
2112 CombineInfo &CI, CombineInfo &Paired,
2114 MachineBasicBlock *
MBB = CI.I->getParent();
2118 const unsigned Opcode = getNewOpcode(CI, Paired);
2121 copyFromSrcRegs(CI, Paired, InsertBefore,
DL, AMDGPU::OpName::vdata);
2124 .
addReg(SrcReg, RegState::Kill);
2126 AddressRegs Regs = getRegs(Opcode, *
TII);
2129 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
2135 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
2138 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
2139 .add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
2140 .addImm(std::min(CI.Offset, Paired.Offset))
2143 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
2145 CI.I->eraseFromParent();
2146 Paired.I->eraseFromParent();
2151SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &
MI)
const {
2152 APInt
V(32, Val,
true);
2153 if (
TII->isInlineConstant(V))
2158 BuildMI(*
MI.getParent(),
MI.getIterator(),
MI.getDebugLoc(),
2159 TII->get(AMDGPU::S_MOV_B32),
Reg)
2167Register SILoadStoreOptimizer::computeBase(MachineInstr &
MI,
2168 const MemAddress &Addr)
const {
2169 MachineBasicBlock *
MBB =
MI.getParent();
2176 if (Addr.Base.UseV64Pattern) {
2178 TII->getRegClass(
TII->get(AMDGPU::V_ADD_U64_e64), 0));
2182 MachineInstr *MovOffset =
2186 MachineInstr *
Add64 =
2189 .
addReg(OffsetReg, RegState::Kill)
2200 assert((
TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2201 Addr.Base.LoSubReg) &&
2202 "Expected 32-bit Base-Register-Low!!");
2204 assert((
TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2205 Addr.Base.HiSubReg) &&
2206 "Expected 32-bit Base-Register-Hi!!");
2208 MachineOperand OffsetLo = createRegOrImm(
static_cast<int32_t
>(Addr.Offset),
MI);
2209 MachineOperand OffsetHi =
2210 createRegOrImm(
static_cast<int32_t
>(Addr.Offset >> 32),
MI);
2212 const auto *CarryRC =
TRI->getWaveMaskRegClass();
2218 MachineInstr *LoHalf =
2220 .
addReg(CarryReg, RegState::Define)
2221 .
addReg(Addr.Base.LoReg, {}, Addr.Base.LoSubReg)
2225 MachineInstr *HiHalf =
2227 .
addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2228 .
addReg(Addr.Base.HiReg, {}, Addr.Base.HiSubReg)
2230 .
addReg(CarryReg, RegState::Kill)
2234 MachineInstr *FullBase =
2245 dbgs() <<
" " << *HiHalf <<
"\n";
2246 dbgs() <<
" " << *FullBase <<
"\n\n";);
2252void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &
MI,
2254 int32_t NewOffset)
const {
2255 auto *
Base =
TII->getNamedOperand(
MI, AMDGPU::OpName::vaddr);
2256 Base->setReg(NewBase);
2257 Base->setIsKill(
false);
2258 TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2264bool SILoadStoreOptimizer::processBaseWithConstOffset64(
2265 MachineInstr *AddDef,
const MachineOperand &
Base, MemAddress &Addr)
const {
2269 MachineOperand *Src0 =
TII->getNamedOperand(*AddDef, AMDGPU::OpName::src0);
2270 MachineOperand *Src1 =
TII->getNamedOperand(*AddDef, AMDGPU::OpName::src1);
2272 const MachineOperand *BaseOp =
nullptr;
2274 auto Offset =
TII->getImmOrMaterializedImm(*Src1);
2285 Addr.Base.LoReg = BaseOp->
getReg();
2286 Addr.Base.UseV64Pattern =
true;
2304void SILoadStoreOptimizer::processBaseWithConstOffset(
const MachineOperand &
Base,
2305 MemAddress &Addr)
const {
2314 if (
Def->getOpcode() == AMDGPU::V_ADD_U64_e64) {
2315 if (processBaseWithConstOffset64(Def,
Base, Addr))
2320 if (
Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
Def->getNumOperands() != 5)
2323 MachineOperand BaseLo =
Def->getOperand(1);
2324 MachineOperand BaseHi =
Def->getOperand(3);
2331 if (!BaseLoDef || BaseLoDef->
getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2332 !BaseHiDef || BaseHiDef->
getOpcode() != AMDGPU::V_ADDC_U32_e64)
2335 MachineOperand *Src0 =
TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2336 MachineOperand *Src1 =
TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2338 auto Offset0P =
TII->getImmOrMaterializedImm(*Src0);
2342 if (!(Offset0P =
TII->getImmOrMaterializedImm(*Src1)))
2347 if (!BaseLo.
isReg())
2350 Src0 =
TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2351 Src1 =
TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2359 uint64_t Offset1 = Src1->
getImm();
2362 if (!BaseHi.
isReg())
2365 Addr.Base.LoReg = BaseLo.
getReg();
2366 Addr.Base.HiReg = BaseHi.
getReg();
2367 Addr.Base.LoSubReg = BaseLo.
getSubReg();
2368 Addr.Base.HiSubReg = BaseHi.
getSubReg();
2369 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2376void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &
MI,
2377 int32_t OffsetDiff)
const {
2378 if (!
TII->usesASYNC_CNT(
MI) || OffsetDiff == 0)
2381 MachineOperand *LDSAddr =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
2383 LDSAddr =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdata);
2388 MachineBasicBlock &
MBB = *
MI.getParent();
2398bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2400 MemInfoMap &Visited,
2401 SmallPtrSet<MachineInstr *, 4> &
AnchorList)
const {
2416 bool AllowNegativeOffset =
2417 TII->allowNegativeFlatOffset(FlatVariant) && !
TII->usesASYNC_CNT(
MI);
2421 bool IsOffsetU16 =
TII->usesASYNC_CNT(
MI);
2428 if (
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm()) {
2434 MachineOperand &
Base = *
TII->getNamedOperand(
MI, AMDGPU::OpName::vaddr);
2435 auto [It,
Inserted] = Visited.try_emplace(&
MI);
2438 processBaseWithConstOffset(
Base, MAddr);
2443 if (MAddr.Offset == 0) {
2444 LLVM_DEBUG(
dbgs() <<
" Failed to extract constant-offset or there are no"
2445 " constant offsets that can be promoted.\n";);
2451 <<
"} Offset: " << MAddr.Offset <<
"\n\n";);
2478 MachineInstr *AnchorInst =
nullptr;
2479 MemAddress AnchorAddr;
2480 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2482 bool MIIsAnchor =
false;
2484 MachineBasicBlock *
MBB =
MI.getParent();
2491 MachineInstr &MINext = *
MBBI;
2495 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2498 const MachineOperand &BaseNext =
2499 *
TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2500 MemAddress MAddrNext;
2501 auto [It,
Inserted] = Visited.try_emplace(&MINext);
2503 processBaseWithConstOffset(BaseNext, MAddrNext);
2504 It->second = MAddrNext;
2506 MAddrNext = It->second;
2508 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2509 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2510 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2511 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2514 InstsWCommonBase.
emplace_back(&MINext, MAddrNext.Offset);
2516 if (AllowNegativeOffset) {
2517 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2518 TargetLoweringBase::AddrMode AM;
2522 (uint32_t)std::abs(Dist) > MaxDist) {
2523 MaxDist = std::abs(Dist);
2525 AnchorAddr = MAddrNext;
2526 AnchorInst = &MINext;
2534 if (!AllowNegativeOffset && !InstsWCommonBase.
empty()) {
2535 for (
auto &[Inst,
Offset] : InstsWCommonBase) {
2536 int64_t Dist = MAddr.Offset -
Offset;
2537 TargetLoweringBase::AddrMode AM;
2542 (!AnchorInst ||
Offset < AnchorAddr.Offset)) {
2543 AnchorAddr = Visited[Inst];
2552 LLVM_DEBUG(
dbgs() <<
" Anchor-Inst(with max-distance from Offset): ";
2553 AnchorInst->
dump());
2555 << AnchorAddr.Offset <<
"\n\n");
2560 int32_t OffsetDiff = MAddr.Offset - AnchorAddr.Offset;
2561 updateBaseAndOffset(
MI,
Base, OffsetDiff);
2562 updateAsyncLDSAddress(
MI, OffsetDiff);
2565 for (
auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2566 TargetLoweringBase::AddrMode AM;
2568 AM.
BaseOffs = OtherOffset - AnchorAddr.Offset;
2571 (AllowNegativeOffset || AM.
BaseOffs >= 0) &&
2575 int32_t OtherOffsetDiff = OtherOffset - AnchorAddr.Offset;
2576 updateBaseAndOffset(*OtherMI,
Base, OtherOffsetDiff);
2577 updateAsyncLDSAddress(*OtherMI, OtherOffsetDiff);
2586 LLVM_DEBUG(
dbgs() <<
" MI is anchor (smallest offset); promoting "
2587 "candidates relative to MI's base.\n");
2590 bool AnyPromoted =
false;
2592 for (
auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2593 int64_t Dist = OtherOffset - MAddr.Offset;
2594 TargetLoweringBase::AddrMode AM;
2601 updateBaseAndOffset(*OtherMI,
Base, Dist);
2602 updateAsyncLDSAddress(*OtherMI, Dist);
2609 TII->getNamedOperand(
MI, AMDGPU::OpName::vaddr)->setIsKill(
false);
2618void SILoadStoreOptimizer::addInstToMergeableList(
const CombineInfo &CI,
2619 std::list<std::list<CombineInfo> > &MergeableInsts)
const {
2620 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2621 if (AddrList.front().InstClass == CI.InstClass &&
2622 AddrList.front().hasSameBaseAddress(CI)) {
2623 AddrList.emplace_back(CI);
2629 MergeableInsts.emplace_back(1, CI);
2632std::pair<MachineBasicBlock::iterator, bool>
2633SILoadStoreOptimizer::collectMergeableInsts(
2635 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &
AnchorList,
2636 std::list<std::list<CombineInfo>> &MergeableInsts)
const {
2642 for (; BlockI != End; ++BlockI) {
2643 MachineInstr &
MI = *BlockI;
2647 if (promoteConstantOffsetToImm(
MI, Visited,
AnchorList))
2652 if (
MI.hasOrderedMemoryRef() ||
MI.hasUnmodeledSideEffects()) {
2660 const InstClassEnum InstClass = getInstClass(
MI.getOpcode(), *
TII);
2661 if (InstClass == UNKNOWN)
2666 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::swz);
2667 if (Swizzled != -1 &&
MI.getOperand(Swizzled).getImm())
2670 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
2671 const MachineOperand *Fmt =
2672 TII->getNamedOperand(
MI, AMDGPU::OpName::format);
2680 CI.setMI(
MI, *
this);
2683 if (!CI.hasMergeableAddress(*MRI))
2698 for (std::list<std::list<CombineInfo>>::iterator
I = MergeableInsts.begin(),
2699 E = MergeableInsts.end();
I !=
E;) {
2701 std::list<CombineInfo> &MergeList = *
I;
2702 if (MergeList.size() <= 1) {
2706 I = MergeableInsts.erase(
I);
2714 [] (
const CombineInfo &
A,
const CombineInfo &
B) {
2715 return A.Offset <
B.Offset;
2726bool SILoadStoreOptimizer::optimizeBlock(
2727 std::list<std::list<CombineInfo> > &MergeableInsts) {
2730 for (std::list<std::list<CombineInfo>>::iterator
I = MergeableInsts.begin(),
2731 E = MergeableInsts.end();
I !=
E;) {
2732 std::list<CombineInfo> &MergeList = *
I;
2734 bool OptimizeListAgain =
false;
2735 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2739 I = MergeableInsts.erase(
I);
2747 if (!OptimizeListAgain) {
2748 I = MergeableInsts.erase(
I);
2751 OptimizeAgain =
true;
2757SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2758 std::list<CombineInfo> &MergeList,
2759 bool &OptimizeListAgain) {
2760 if (MergeList.empty())
2765 for (
auto I = MergeList.begin(),
Next = std::next(
I);
Next != MergeList.end();
2766 Next = std::next(
I)) {
2771 if ((*First).Order > (*Second).Order)
2773 CombineInfo &CI = *
First;
2774 CombineInfo &Paired = *Second;
2776 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2784 LLVM_DEBUG(
dbgs() <<
"Merging: " << *CI.I <<
" with: " << *Paired.I);
2787 switch (CI.InstClass) {
2792 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2795 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2797 case S_BUFFER_LOAD_IMM:
2798 case S_BUFFER_LOAD_SGPR_IMM:
2800 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2801 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2804 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2805 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2808 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2809 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2812 NewMI = mergeImagePair(CI, Paired, Where->I);
2813 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2816 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2817 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2820 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2821 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2824 case FLAT_LOAD_SADDR:
2826 case GLOBAL_LOAD_SADDR:
2827 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2828 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2831 case FLAT_STORE_SADDR:
2833 case GLOBAL_STORE_SADDR:
2834 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2835 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2838 CI.setMI(NewMI, *
this);
2839 CI.Order = Where->Order;
2843 MergeList.erase(Second);
2849bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2852 return SILoadStoreOptimizer(
2853 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2857bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2877 for (MachineBasicBlock &
MBB : MF) {
2881 bool CollectModified;
2882 std::list<std::list<CombineInfo>> MergeableInsts;
2886 std::tie(SectionEnd, CollectModified) =
2892 OptimizeAgain =
false;
2894 }
while (OptimizeAgain);
2916 bool Changed = SILoadStoreOptimizer(&
AA).run(MF);
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, "AMDGPU Image Intrinsic Optimizer", false, false) char AMDGPUImageIntrinsicOptimizer void addInstToMergeableList(IntrinsicInst *II, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr)
BasicBlock::iterator collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts)
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static MaybeAlign getAlign(Value *Ptr)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi)
static bool needsConstrainedOpcode(const GCNSubtarget &STM, ArrayRef< MachineMemOperand * > MMOs, unsigned Width)
static void addDefsUsesToList(const MachineInstr &MI, DenseSet< Register > &RegDefs, DenseSet< Register > &RegUses)
static unsigned getBufferFormatWithCompCount(unsigned OldFormat, unsigned ComponentCount, const GCNSubtarget &STI)
static bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT, const TargetTransformInfo &TTI, const DataLayout &DL, bool HasBranchDivergence, DomTreeUpdater *DTU)
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
Represents analyses that only rely on functions' control flow.
static LLVM_ABI DebugLoc getMergedLocation(DebugLoc LocA, DebugLoc LocB)
When two instructions are combined into a single instruction we also need to combine the original loc...
Implements a dense probed hash-table based set.
FunctionPass class - This class is used to implement most global optimizations.
bool hasOptNone() const
Do not optimize this function (-O0).
bool loadStoreOptEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDwordx3LoadStores() const
const SITargetLowering * getTargetLowering() const override
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
bool isXNACKEnabled() const
const HexagonRegisterInfo & getRegisterInfo() const
TypeSize getValue() const
unsigned getOpcode() const
Return the opcode number for this descriptor.
An RAII based helper class to modify MachineFunctionProperties when running pass.
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & cloneMergedMemRefs(ArrayRef< const MachineInstr * > OtherMIs) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI void dump() const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
const MachinePointerInfo & getPointerInfo() const
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Wrapper class representing virtual and physical registers.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
static bool isFLATScratch(const MachineInstr &MI)
static bool isVIMAGE(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
StringRef - Represent a constant reference to a string, i.e.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset)
Convert ByteOffset to dwords if the subtarget uses dword SMRD immediate offsets.
bool getMTBUFHasSrsrc(unsigned Opc)
int getMTBUFElements(unsigned Opc)
bool getMTBUFHasSoffset(unsigned Opc)
int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements)
int getMUBUFBaseOpcode(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
int getMTBUFBaseOpcode(unsigned Opc)
bool getMUBUFHasVAddr(unsigned Opc)
int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements)
bool getMUBUFHasSoffset(unsigned Opc)
const MIMGBaseOpcodeInfo * getMIMGBaseOpcode(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
bool getMTBUFHasVAddr(unsigned Opc)
int getMUBUFElements(unsigned Opc)
const GcnBufferFormatInfo * getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, const MCSubtargetInfo &STI)
bool getMUBUFHasSrsrc(unsigned Opc)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Add64
64 bits label addition
NodeAddr< DefNode * > Def
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
This is an optimization pass for GlobalISel generic memory operations.
bool operator<(int64_t V1, const APSInt &V2)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
constexpr T maskLeadingOnes(unsigned N)
Create a bitmask with the N left-most bits set to 1, and all other bits set to 0.
FunctionPass * createSILoadStoreOptimizerLegacyPass()
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
char & SILoadStoreOptimizerLegacyID
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
std::vector< std::pair< LineLocation, FunctionId > > AnchorList
constexpr unsigned BitWidth
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.