<TeXmacs|2.1.4>

<style|<tuple|beamer|vdh|wide-spacing|shadowed-frames|shadowed-titles|std-frame|dark-vador|framed-theorems|caas>>

<\body>
  <\hide-preamble>
    <assign|xfont|mathlarge=TeX Gyre Pagella,Fira>

    <assign|title-sub-bar-contents|<macro|body|>>

    \;

    <assign|rsC|<macro|<with|font|cal*|C>>>

    <assign|rsT|<macro|<with|font|cal*|T>>>

    <assign|marked|<macro|body|<tabular*|<tformat|<cwith|1|1|1|1|cell-background|dark
    grey>|<cwith|1|1|1|1|cell-bsep|0.25spc>|<cwith|1|1|1|1|cell-tsep|0.25spc>|<cwith|1|1|1|1|cell-lsep|1spc>|<cwith|1|1|1|1|cell-rsep|1spc>|<table|<row|<cell|<arg|body>>>>>>>>

    \;

    <assign|compressed|<\macro|body>
      <\surround||<right-flush>>
        <\with|par-par-sep|<times|0.4|<value|par-par-sep>>>
          <arg|body>
        </with>
      </surround>
    </macro>>

    \;

    <assign|render-specified-algorithm|<\macro|name|intro|body>
      <\padded*>
        <\with|par-first|0fn|par-par-sep|0.3em|item-hsep|<macro|1tab>|numbered-offset|<value|algorithm-numbered-offset>>
          <\surround||<vspace|0.5fn>>
            <algorithm-name|<arg|name>>

            <arg|intro>
          </surround>

          <\surround||<yes-indent*>>
            <\algorithm-indent>
              <arg|body>
            </algorithm-indent>
          </surround>
        </with>
      </padded*>
    </macro>>

    <assign|caas-input|<\macro|prompt|body>
      <\with|generic-prompt-color|<value|caas-prompt-color>|generic-input-color|<value|caas-input-color>>
        <generic-input|<with|font|Menlo|font-size|0.71|<arg|prompt>>|<with|font|Menlo|font-size|0.71|<arg|body>>>
      </with>
    </macro>>

    <assign|caas-output|<macro|body|<generic-output|<with|prog-language|verbatim|<arg|body>>>>>

    \;

    <assign|cpp|<macro|body|<with|xmode|prog|xprog-language|cpp|font-family|tt|font|Menlo|<small|<arg|body>>>>>

    <assign|framed-quoted|<\macro|body>
      <arg|body>
    </macro>>

    \;

    \;
  </hide-preamble>

  <screens|<\shown>
    <\with|par-mode|center>
      <tabular*|<tformat|<cwith|1|2|1|1|cell-bsep|2.5spc>|<cwith|6|6|1|1|cell-halign|r>|<cwith|6|6|1|1|cell-hyphen|t>|<cwith|1|1|1|1|cell-bsep|2spc>|<cwith|1|1|1|1|cell-tsep|4spc>|<cwith|2|2|1|1|cell-tsep|4spc>|<table|<row|<cell|<large|<strong|<with|color|orange|The
      JIL library (Justinline) for computations with
      SLPs>>>>>|<row|<cell|<tabular*|<tformat|<twith|table-valign|T>|<cwith|1|1|1|1|cell-bsep|1spc>|<cwith|1|1|1|1|cell-tsep|1spc>|<table|<row|<cell|>>>>>
      <large|<tabular*|<tformat|<twith|table-valign|T>|<cwith|3|3|1|1|cell-bsep|0spc>|<cwith|3|3|1|1|cell-tsep|0spc>|<table|<row|<cell|<space|2em><strong|Joris
      van der Hoeven><space|2em>>>|<row|<cell|<small|<with|color|pastel
      brown|Joint work with Albin <name|Ahlbäck>, Ricardo <name|Buring>,
      Grégoire <name|Lecerf>>>>>|<row|<cell|<em|<small|CNRS, École
      polytechnique, France>>>>>>>>>>|<row|<cell|>>|<row|<cell|<image|LOGO_ERC-FLAG_FP.png|0.45par|||>>>|<row|<cell|<space|0.9par>>>|<row|<\cell>
        <small|<strong|<with|color|orange|ODELIX thematic program,
        Palaiseau>>><htab|5mm><small|<strong|<with|color|orange|December 8,
        2025>>>
      </cell>>>>>
    </with>
  </shown>|<\hidden>
    \;

    \;

    \;

    <\center>
      <with|frame-recolor|#fa6|frame-hpadding|2tab|frame-vpadding|1tab|<\black-floral1-frame>
        <\compressed>
          <\center>
            <strong|<with|color|#aff|Part I>>

            <strong|Introduction and motivation>
          </center>
        </compressed>
      </black-floral1-frame>>
    </center>
  </hidden>|<\hidden>
    <tit|Example of an SLP (Straight Line Program)>

    <\overlays-phantoms|1|4>
      <\compact>
        <\equation*>
          <tabular|<tformat|<table|<row|<cell|in<around*|(|<with|color|#aff|x>,<with|color|#aff|y>|)>>>|<row|<cell|a\<assign\>x\<cdot\>x>>|<row|<cell|b\<assign\>7\<cdot\>y>>|<row|<cell|r\<assign\>a+b>>|<row|<cell|r\<assign\>r\<cdot\>r>>|<row|<cell|out<around*|(|<with|color|#fa6|a>,<with|color|#fa6|r>|)>>>>>><label|eq:intro-slp>
        </equation*>

        <with|gr-mode|<tuple|group-edit|edit-props>|gr-frame|<tuple|scale|1cm|<tuple|0.180017gw|0.499999gh>>|gr-geometry|<tuple|geometry|1par|0.304341par|center>|gr-grid|<tuple|empty>|gr-grid-old|<tuple|cartesian|<point|0|0>|1>|gr-edit-grid-aspect|<tuple|<tuple|axes|none>|<tuple|1|none>|<tuple|10|none>>|gr-edit-grid|<tuple|empty>|gr-edit-grid-old|<tuple|cartesian|<point|0|0>|1>|gr-grid-aspect-props|<tuple|<tuple|axes|#808080>|<tuple|1|#c0c0c0>|<tuple|10|darker
        grey>>|gr-grid-aspect|<tuple|<tuple|axes|#808080>|<tuple|1|#c0c0c0>|<tuple|10|darker
        grey>>|gr-text-at-halign|center|gr-text-at-repulse|1spc|gr-proviso|<show-from|1>|<graphics||<with|text-at-halign|center|text-at-repulse|0.5spc|proviso|<show-from|2>|<math-at|\<times\>|<point|-1.0|0.8>>>|<with|text-at-halign|center|text-at-repulse|0.5spc|proviso|<show-from|2>|<math-at|+|<point|-1.0|0.0>>>|<with|text-at-halign|center|text-at-repulse|0.5spc|proviso|<show-from|2>|<math-at|\<times\>|<point|-1.5|-0.8>>>|<with|text-at-halign|center|text-at-repulse|0.5spc|proviso|<show-from|2>|<math-at|<with|color|#aff|x>|<point|-1.5|-1.6>>>|<with|proviso|<show-from|2>|<spline|<point|-1|0.9>|<point|-1.2|0.5>|<point|-1.0|0.1>>>|<with|proviso|<show-from|2>|<spline|<point|-1|0.9>|<point|-0.8|0.5>|<point|-1.0|0.1>>>|<with|proviso|<show-from|2>|<spline|<point|-1.5|-0.7>|<point|-1.7|-1.1>|<point|-1.5|-1.5>>>|<with|proviso|<show-from|2>|<spline|<point|-1.5|-0.7>|<point|-1.3|-1.1>|<point|-1.5|-1.5>>>|<with|text-at-repulse|0.5spc|text-at-halign|center|proviso|<show-from|2>|<math-at|\<times\>|<point|-0.3|-0.8>>>|<with|text-at-repulse|0.5spc|text-at-halign|center|proviso|<show-from|2>|<math-at|<with|color|#aff|y>|<point|0.1|-1.6>>>|<with|text-at-halign|center|text-at-repulse|1spc|proviso|<show-from|2>|<math-at|7|<point|-0.7|-1.6>>>|<with|proviso|<show-from|2>|<line|<point|-1.00703|0.0783579>|<point|-1.5|-0.7>>>|<with|proviso|<show-from|2>|<line|<point|-1|0.1>|<point|-0.3|-0.7>>>|<with|proviso|<show-from|2>|<line|<point|-0.3|-0.7>|<point|-0.7|-1.5>>>|<with|proviso|<show-from|2>|<line|<point|-0.3|-0.7>|<point|0.1|-1.5>>>|<with|text-at-repulse|0.5spc|text-at-halign|center|proviso|<show-from|2>|<math-at|<with|color|#fa6|r>|<point|-1.0|1.5>>>|<with|proviso|<show-from|2>|<line|<point|-1.0|1.6>|<point|-1.0|0.9>>>|<with|text-at-repulse|0.5spc|text-at-halign|center|proviso|<show-from|2>|<math-at|<with|color|#fa6|a>|<point|-1.8|1.5>>>|<with|proviso|<show-from|2>|<spline|<point|-1.8|1.6>|<point|-1.8|0.5>|<point|-1.5|-0.7>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-f
rom|4>|<math-at|\<times\>|<point|5.0|1.0>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|<with|color|#aff|x>|<point|4.6|0.2>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|<with|color|#aff|x>|<point|5.4|0.2>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|\<times\>|<point|7.5|1.0>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|+|<point|6.5|0.2>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|+|<point|8.5|0.2>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|\<times\>|<point|6|-0.6>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|\<times\>|<point|7|-0.6>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|\<times\>|<point|8|-0.6>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|\<times\>|<point|9|-0.6>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|<with|color|#aff|x>|<point|5.7|-1.4>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|<with|color|#aff|x>|<point|6.3|-1.4>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|<with|color|#aff|y>|<point|7.3|-1.4>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|7|<point|6.8|-1.4>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|<with|color|#aff|x>|<point|7.7|-1.4>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|<with|color|#aff|x>|<point|8.3|-1.4>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|7|<point|8.8|-1.4>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<math-at|<with|color|#aff|y>|<point|9.3|-1.4>>>|<with|proviso|<show-from|4>|<line|<point|5|1.1>|<point|4.6|0.3>>>|<w
ith|proviso|<show-from|4>|<line|<point|5|1.1>|<point|5.4|0.3>>>|<with|proviso|<show-from|4>|<line|<point|7.5|1.1>|<point|6.5|0.3>>>|<with|proviso|<show-from|4>|<line|<point|7.5|1.1>|<point|8.5|0.3>>>|<with|proviso|<show-from|4>|<line|<point|6.5|0.3>|<point|6.0|-0.5>>>|<with|proviso|<show-from|4>|<line|<point|6.5|0.3>|<point|7.0|-0.5>>>|<with|proviso|<show-from|4>|<line|<point|8.5|0.3>|<point|8.0|-0.5>>>|<with|proviso|<show-from|4>|<line|<point|8.5|0.3>|<point|9.0|-0.5>>>|<with|proviso|<show-from|4>|<line|<point|6|-0.5>|<point|5.7|-1.3>>>|<with|proviso|<show-from|4>|<line|<point|6|-0.5>|<point|6.3|-1.3>>>|<with|proviso|<show-from|4>|<line|<point|7|-0.5>|<point|6.8|-1.3>>>|<with|proviso|<show-from|4>|<line|<point|7|-0.5>|<point|7.3|-1.3>>>|<with|proviso|<show-from|4>|<line|<point|8|-0.5>|<point|7.7|-1.3>>>|<with|proviso|<show-from|4>|<line|<point|8|-0.5>|<point|8.3|-1.3>>>|<with|proviso|<show-from|4>|<line|<point|9|-0.5>|<point|8.8|-1.3>>>|<with|proviso|<show-from|4>|<line|<point|9|-0.5>|<point|9.3|-1.3>>>|<with|text-at-repulse|1spc|text-at-halign|center|proviso|<show-from|4>|<text-at|<small|<with|color|grey|Forest>>|<point|6.1|0.8>>>|<with|proviso|<show-from|3>|<math-at|<around*|(|<with|color|#aff|x><rsup|2>,<around*|(|<with|color|#aff|x><rsup|2>+7*<with|color|#aff|y>|)><rsup|2>|)>|<point|1.3|0.4>>>|<with|text-at-halign|center|proviso|<show-from|3>|<text-at|<small|<with|color|grey|Expression>>|<point|2.4|-0.1>>>|<with|text-at-halign|center|proviso|<show-from|2>|<text-at|<small|<with|color|grey|DAG>>|<point|-0.2|0.0>>>>>
      </compact>
    </overlays-phantoms>
  </hidden>|<\hidden>
    <tit|Why SLPs?>

    <\compressed>
      <unroll|<\shown>
        <strong|Theory>

        <\itemize>
          <unroll|<\shown>
            <item>Algebraic complexity analysis
          </shown>|<\hidden*>
            <item>More general than it seems <math|\<longrightarrow\>>
            recording<vspace|0.33333fn>
          </hidden*>>
        </itemize>
      </shown>|<\hidden*>
        <strong|Practice>

        <\itemize>
          <unroll|<\shown>
            <item>Evaluate same SLP many times <math|\<longrightarrow\>>
            allows for massive parallelism
          </shown>|<\hidden*>
            <item>Trivial control <math|\<longrightarrow\>> easier to compile
            and optimize
          </hidden*>|<\hidden*>
            <item>Optimization techniques from both compilers and symbolic
            computation<vspace|0.3333fn>
          </hidden*>>
        </itemize>
      </hidden*>|<\hidden*>
        <strong|Applications>

        <\itemize>
          <unroll|<\shown>
            <item><overlays-phantoms|1|3|Homotopy
            continuation<overlay-from|2|, geometric
            resolution|><overlay-from|3|, gradient descent|>>
          </shown>|<\hidden*>
            <item><overlays-phantoms|1|3|Discrete Fourier
            Transforms<overlay-from|2|, multi-precision
            arithmetic|><overlay-from|3|, other \Pcodelets\Q|>>
          </hidden*>|<\hidden*>
            <item>Solving ODEs via relaxed power series computations
          </hidden*>>
        </itemize>
      </hidden*>>
    </compressed>
  </hidden>|<\hidden>
    <tit|Some references>

    <\compressed>
      <unroll|<\shown>
        <strong|Theory>

        <\itemize>
          <item>Bürgisser, Clausen, Shokrollahi:
          <newblock><with|font-shape|italic|Algebraic complexity theory>,
          1997<vspace|0.6666fn>
        </itemize>
      </shown>|<\hidden*>
        <strong|Software>

        <\itemize>
          <item>Díaz, Kaltofen: <em|FoxBox>, 1998<vspace|0.6666fn>
        </itemize>
      </hidden*>|<\hidden*>
        <strong|Many <em|ad hoc> software packages for specific applications>

        <\itemize>
          <item>Geometric resolution <math|\<longrightarrow\>> Aldaz,
          Castaño, Llovet, Martínez, Hägele, Bruno, Heintz, Matera, Giusti,
          Lecerf, Salvy, Durvye, <text-dots>, 2000\U2008

          <item>Codelets, DFTs <math|\<longrightarrow\>> Frigo-Johnson,
          <em|FFTW3>, 1999, Püschel et al., <em|Spiral>, 2005

          <item>Automatic differentiation <math|\<longrightarrow\>>
          <em|Autodiff>, <em|Torch.autograd>, <em|Juliadiff>,
          <text-dots><vspace|0.6666fn>
        </itemize>
      </hidden*>|<\hidden*>
        <strong|More references>

        <\itemize>
          <item>vdH, Lecerf: <em|Towards a library for straight-line
          programs>, AAECC, 2025
        </itemize>
      </hidden*>>
    </compressed>
  </hidden>|<\hidden>
    <tit|Towards a dedicated library>

    <\compressed>
      <unroll|<\shown>
        <strong|<name|Justinline> (in <name|Mathemagix>)<htab|5mm>(2015\U2024)>
      </shown>|<\hidden*>
        <\itemize>
          <unroll|<\shown>
            <item>High level source code and interface.
          </shown>|<\hidden*>
            <item>Reasonably fast on-the-fly compilation of SLPs.
          </hidden*>|<\hidden*>
            <item>Resulting machine code is fast.
          </hidden*>|<\hidden*>
            <item>Suite of high level transformations and routines on
            SLPs.<vspace|0.6666fn>
          </hidden*>>
        </itemize>
      </hidden*>|<\hidden*>
        <strong|<name|JIL> (in C++)<htab|5mm>(2024\U*)>
      </hidden*>|<\hidden*>
        <\itemize>
          <unroll|<\shown>
            <item>Stand-alone C++ library <math|\<longrightarrow\>> lower
            threshold for developers and users.
          </shown>|<\hidden*>
            <item>Low level source code, planned bindings for
            <name|Mathemagix>, <name|Julia>, <name|Flint>, <text-dots>
          </hidden*>|<\hidden*>
            <item>Very fast on-the-fly compilation of SLPs.
          </hidden*>|<\hidden*>
            <item>More architectures: X86, ARM, <name|OpenCL>, <name|Cuda>,
            <name|Sass>.
          </hidden*>|<\hidden*>
            <item>Beyond SLPs<text-dots>?
          </hidden*>>
        </itemize>
      </hidden*>>
    </compressed>
  </hidden>|<\hidden>
    <tit|Motivation and spoiler>

    <\center>
      <flat-size|<tabular|<tformat|<cwith|13|13|1|1|cell-bborder|0ln>|<cwith|1|-1|1|1|cell-lborder|0ln>|<cwith|13|13|2|2|cell-bborder|0ln>|<cwith|1|-1|2|2|cell-lborder|1ln>|<cwith|1|-1|1|1|cell-rborder|1ln>|<cwith|1|-1|2|2|cell-rborder|1ln>|<cwith|1|-1|3|3|cell-lborder|1ln>|<cwith|13|13|4|4|cell-bborder|0ln>|<cwith|1|-1|4|4|cell-lborder|0ln>|<cwith|1|-1|4|4|cell-rborder|1ln>|<cwith|1|-1|5|5|cell-lborder|1ln>|<cwith|1|-1|6|6|cell-lborder|0ln>|<cwith|1|-1|5|5|cell-rborder|0ln>|<cwith|1|-1|6|6|cell-rborder|1ln>|<cwith|2|2|1|-1|cell-bborder|1ln>|<cwith|2|2|1|1|cell-lborder|0ln>|<cwith|1|1|1|-1|cell-tborder|0ln>|<cwith|1|1|1|-1|cell-bborder|0ln>|<cwith|2|2|1|-1|cell-tborder|0ln>|<cwith|1|1|1|1|cell-lborder|0ln>|<cwith|1|1|2|2|cell-tborder|0ln>|<cwith|1|1|2|2|cell-bborder|0ln>|<cwith|2|2|2|2|cell-tborder|0ln>|<cwith|1|1|2|2|cell-lborder|0ln>|<cwith|1|1|1|1|cell-rborder|0ln>|<cwith|1|1|2|2|cell-rborder|0ln>|<cwith|1|1|3|3|cell-lborder|0ln>|<cwith|1|1|4|4|cell-tborder|0ln>|<cwith|1|1|4|4|cell-bborder|0ln>|<cwith|2|2|4|4|cell-tborder|0ln>|<cwith|1|1|4|4|cell-lborder|0ln>|<cwith|1|1|4|4|cell-rborder|0ln>|<cwith|1|1|5|5|cell-lborder|0ln>|<cwith|1|1|6|6|cell-tborder|0ln>|<cwith|1|1|6|6|cell-lborder|0ln>|<cwith|1|1|5|5|cell-rborder|0ln>|<cwith|1|1|6|6|cell-rborder|0ln>|<cwith|1|1|5|5|cell-row-span|1>|<cwith|1|1|5|5|cell-col-span|2>|<cwith|1|1|3|3|cell-halign|c>|<cwith|3|13|2|2|cell-halign|r>|<cwith|3|13|3|4|cell-halign|r>|<cwith|2|2|2|-1|cell-halign|c>|<cwith|3|13|5|6|cell-halign|r>|<cwith|1|1|5|5|cell-halign|c>|<cwith|10|10|1|-1|cell-tborder|0ln>|<cwith|9|9|1|-1|cell-bborder|0ln>|<cwith|10|10|1|-1|cell-bborder|1ln>|<cwith|11|11|1|-1|cell-tborder|1ln>|<cwith|10|10|1|1|cell-lborder|0ln>|<cwith|2|2|6|6|cell-tborder|0ln>|<cwith|1|1|5|6|cell-bborder|0ln>|<cwith|2|2|6|6|cell-lborder|0ln>|<cwith|2|2|5|5|cell-rborder|0ln>|<cwith|2|2|6|6|cell-rborder|0ln>|<cwith|3|8|3|3|color|#fa6>|<cwith|9|10|4|4|color|#fa6>|<cwith|3|10|6|6|color|#fa6>|<cwith|11|13|3|3|color|#fa6>|<cwith|11|12|5|5|colo
r|#fa6>|<cwith|2|2|7|8|cell-bborder|1ln>|<cwith|1|1|7|8|cell-tborder|0ln>|<cwith|1|1|7|8|cell-bborder|0ln>|<cwith|2|2|7|8|cell-tborder|0ln>|<cwith|2|2|7|8|cell-halign|c>|<cwith|1|2|7|7|cell-lborder|1ln>|<cwith|2|2|7|8|cell-bborder|1ln>|<cwith|1|1|7|8|cell-tborder|0ln>|<cwith|1|1|7|8|cell-bborder|0ln>|<cwith|2|2|7|8|cell-tborder|0ln>|<cwith|1|1|7|7|cell-lborder|0ln>|<cwith|1|1|8|8|cell-bborder|0ln>|<cwith|2|2|8|8|cell-tborder|0ln>|<cwith|2|2|7|8|cell-halign|c>|<cwith|1|1|7|7|cell-halign|c>|<cwith|1|1|8|8|cell-tborder|0ln>|<cwith|1|2|8|8|cell-lborder|0ln>|<cwith|1|2|7|7|cell-rborder|0ln>|<cwith|1|2|8|8|cell-rborder|0ln>|<cwith|1|1|7|7|cell-row-span|1>|<cwith|1|1|7|7|cell-col-span|2>|<cwith|1|1|3|3|cell-row-span|1>|<cwith|1|1|3|3|cell-col-span|2>|<cwith|3|3|6|6|cell-tborder|1ln>|<cwith|2|2|6|6|cell-bborder|1ln>|<cwith|3|10|6|6|cell-lborder|0ln>|<cwith|3|10|5|5|cell-rborder|0ln>|<cwith|3|10|6|6|cell-rborder|1ln>|<cwith|3|10|7|7|cell-lborder|1ln>|<cwith|11|11|6|6|cell-tborder|1ln>|<cwith|10|10|6|6|cell-bborder|1ln>|<cwith|13|13|6|6|cell-bborder|0ln>|<cwith|11|13|6|6|cell-lborder|0ln>|<cwith|11|13|5|5|cell-rborder|0ln>|<cwith|11|13|6|6|cell-rborder|1ln>|<cwith|11|13|7|7|cell-lborder|1ln>|<cwith|10|10|7|8|cell-tborder|0ln>|<cwith|9|9|7|8|cell-bborder|0ln>|<cwith|10|10|7|8|cell-bborder|1ln>|<cwith|11|11|7|8|cell-tborder|1ln>|<cwith|3|13|7|7|cell-lborder|1ln>|<cwith|3|13|7|8|cell-halign|r>|<cwith|13|13|8|8|cell-bborder|0ln>|<cwith|3|13|8|8|cell-lborder|0ln>|<cwith|3|13|7|7|cell-rborder|0ln>|<cwith|3|13|8|8|cell-rborder|0ln>|<cwith|3|8|7|7|color|#fa6>|<cwith|9|10|8|8|color|#fa6>|<cwith|11|13|7|7|color|#fa6>|<cwith|1|1|1|-1|cell-tsep|0spc>|<table|<row|<cell|>|<cell|>|<cell|<name|Justinline>>|<cell|>|<cell|<name|Jil>>|<cell|>|<cell|GPU>|<cell|>>|<row|<cell|>|<cell|sols>|<cell|jit>|<cell|exe<rsub|8>>|<cell|jit>|<cell|exe<rsub|8>>|<cell|jit>|<cell|exe<rsub|4096>>>|<row|<cell|Katsura<rsub|6>>|<cell|64>|<cell|1.11
      s>|<cell|3 ms>|<cell|7.13 ms>|<cell|39.1 ms>|<cell|1.00 s>|<cell|68.9
      ms>>|<row|<cell|Katsura<rsub|8>>|<cell|256>|<cell|2.17 s>|<cell|10
      ms>|<cell|13.6 ms>|<cell|45.5 ms>|<cell|3.21 s>|<cell|0.42
      s>>|<row|<cell|Katsura<rsub|10>>|<cell|1024>|<cell|3.74 s>|<cell|37
      ms>|<cell|23.8 ms>|<cell|0.20 s>|<cell|7.58 s>|<cell|0.72
      s>>|<row|<cell|Katsura<rsub|12>>|<cell|4096>|<cell|6.16 s>|<cell|0.23
      s>|<cell|42.4 ms>|<cell|0.46 s>|<cell|15.3 s>|<cell|1.22
      s>>|<row|<cell|Katsura<rsub|14>>|<cell|16384>|<cell|9.34 s>|<cell|1.59
      s>|<cell|90.1 ms>|<cell|1.94 s>|<cell|29.4 s>|<cell|5.53
      s>>|<row|<cell|Katsura<rsub|16>>|<cell|65536>|<cell|13.4 s>|<cell|11.3
      s>|<cell|0.27 s>|<cell|11.9 s>|<cell|49.6 s>|<cell|29.1
      s>>|<row|<cell|Katsura<rsub|18>>|<cell|262144>|<cell|21.3 s>|<cell|145
      s>|<cell|0.95 s>|<cell|83.2 s>|<cell|82.3 s>|<cell|157
      s>>|<row|<cell|Katsura<rsub|20>>|<cell|1048576>|<cell|45.1 s>|<cell|824
      s>|<cell|3.95 s>|<cell|512 s>|<cell|124 s>|<cell|911
      s>>|<row|<cell|Posso<rsub|3,3>>|<cell|27>|<cell|0.35 s>|<cell|\<less\>1
      ms>|<cell|3.05 ms>|<cell|0.26 ms>|<cell|0.49 s>|<cell|5.98
      ms>>|<row|<cell|Posso<rsub|4,4>>|<cell|256>|<cell|1.50 s>|<cell|3
      ms>|<cell|11.9 ms>|<cell|1.5 ms>|<cell|1.76 s>|<cell|0.12
      s>>|<row|<cell|Posso<rsub|5,5>>|<cell|3125>|<cell|8.22 s>|<cell|78
      ms>|<cell|52.4 ms>|<cell|<with|color|#fa6|0.31 s>>|<cell|19.1
      s>|<cell|0.96 s>>>>>>

      <\flat-size>
        <\compact>
          <name|Intel> <name|Xeon> 3.2 GHz, AVX2 (<name|Justinline>), AVX512
          (JIL), 8 threads

          <name|Nvidia> <name|GeForce> RTX 4070 <name|Super> GPU,
          <name|OpenCL>, 4096 threads

          2<rsup|nd> order stepper (<name|Justinline>), 1<rsup|st> order
          stepper (JIL)
        </compact>
      </flat-size>
    </center>
  </hidden>|<\hidden>
    <tit|Modern hardware>

    <\compressed>
      <unroll|<\shown>
        <strong|CPU>

        <\itemize>
          <item>Out of order execution

          <item>Cache hierarchies

          <item>Up to 8 or 16 wide SIMD parallelism

          <item>Up to 100 cores
        </itemize>
      </shown>|<\hidden*>
        <strong|GPU>

        <\itemize>
          <item>Up to 10<space|0.4spc>000 cores

          <item>Often limited to 32 bits

          <item>Large transfer times with CPU

          <item>Hard to program and processors mostly undocumented
        </itemize>
      </hidden*>|<\hidden*>
        <strong|TPU>

        <\itemize>
          <item>Only matrix multiplication, but about 8 times faster than
          SIMD in theory
        </itemize>
      </hidden*>>
    </compressed>
  </hidden>|<\hidden>
    \;

    \;

    \;

    <\center>
      <with|frame-recolor|#fa6|frame-hpadding|2tab|frame-vpadding|1tab|<\black-floral1-frame>
        <\compressed>
          <\center>
            <strong|<with|color|#aff|Part II>>

            <strong|Representing SLPs in JIL>
          </center>
        </compressed>
      </black-floral1-frame>>
    </center>

    \;
  </hidden>|<\hidden>
    <tit|Low level representation of SLPs>

    \;

    <\overlays-phantoms|1|6>
      <\equation*>
        <tabular|<tformat|<table|<row|<cell|in<around*|(|<with|color|#aff|x>,<with|color|#aff|y>|)>>>|<row|<cell|a\<assign\>x\<cdot\>x>>|<row|<cell|b\<assign\>7\<cdot\>y>>|<row|<cell|r\<assign\>a+b>>|<row|<cell|r\<assign\>r\<cdot\>r>>|<row|<cell|out<around*|(|<with|color|#fa6|a>,<with|color|#fa6|r>|)>>>>>>\<nocomma\><space|3em><overlay-from|2|<tabular|<tformat|<table|<row|<cell|in<around*|(|<with|color|#aff|x<rsub|0>>,<with|color|#aff|x<rsub|1>>|)>>>|<row|<cell|x<rsub|3>\<assign\>x<rsub|0>\<cdot\>x<rsub|0>>>|<row|<cell|x<rsub|4>\<assign\>x<rsub|2>\<cdot\>x<rsub|1>>>|<row|<cell|x<rsub|5>\<assign\>x<rsub|3>+x<rsub|4>>>|<row|<cell|x<rsub|5>\<assign\>x<rsub|5>\<cdot\>x<rsub|5>>>|<row|<cell|out<around*|(|<with|color|#fa6|x<rsub|3>>,<with|color|#fa6|x<rsub|5>>|)>>>>>>|>\<nocomma\><space|3em><overlay-from|3|<tabular|<tformat|<cwith|1|-1|1|1|cell-halign|r>|<cwith|1|3|1|-1|cell-bsep|1.5spc>|<table|<row|<cell|<text|<with|color|#afa|in>>>|<cell|<block|<tformat|<table|<row|<cell|0>|<cell|1>>>>>>>|<row|<cell|<text|<with|color|#afa|out>>>|<cell|<block|<tformat|<table|<row|<cell|3>|<cell|5>>>>>>>|<row|<cell|<text|<with|color|#afa|prg>>>|<cell|<block|<tformat|<table|<row|<cell|\<times\>>|<cell|3>|<cell|0>|<cell|0>|<cell|\<times\>>|<cell|4>|<cell|2>|<cell|1>>|<row|<cell|+>|<cell|5>|<cell|3>|<cell|4>|<cell|\<times\>>|<cell|5>|<cell|5>|<cell|5>>>>>>>|<row|<cell|<text|<with|color|#afa|data>>>|<cell|<block|<tformat|<table|<row|<cell|0>|<cell|0>|<cell|7>|<cell|0>|<cell|0>|<cell|0>>>>>>>>>>|>
      </equation*>

      \;

      <\compressed>
        <overlay-from|4|<with|color|#afa|in>, <with|color|#afa|out>,
        <with|color|#afa|prg>: arrays of 32 bit integers|>

        <overlay-from|5|<math|\<Longrightarrow\>> operations <math|+>,
        <math|\<times\>>, <math|\<ldots\>> encoded as 32 bit integers|>

        <overlay-from|6|<with|color|#afa|data>: array of elements in some
        \Pscalar domain\Q <math|\<bbb-K\>>|>
      </compressed>
    </overlays-phantoms>
  </hidden>|<\hidden>
    <tit|Domains>

    <\compressed>
      <unroll|<\shown>
        <strong|Hardware domains>

        <\itemize>
          <unroll|<\shown>
            <item><math|\<bbb-Z\><rsub|8>,\<bbb-Z\><rsub|16>,\<bbb-Z\><rsub|32>,\<bbb-Z\><rsub|64>,\<bbb-N\><rsub|8>,\<bbb-N\><rsub|16>,\<bbb-N\><rsub|32>,\<bbb-N\><rsub|64>,\<bbb-R\><rsub|32>,\<bbb-R\><rsub|64>>
          </shown>|<\hidden*>
            <item>SIMD variants, <abbr|e.g.>
            <math|\<bbb-Z\><rsub|8><rsup|64>,\<bbb-Z\><rsub|16><rsup|32>,\<bbb-Z\><rsub|32><rsup|16>,\<bbb-Z\><rsub|64><rsup|8>,\<bbb-N\><rsub|8><rsup|64>,\<bbb-N\><rsub|16><rsup|32>,\<bbb-N\><rsub|32><rsup|16>,\<bbb-N\><rsub|64><rsup|8>,\<bbb-R\><rsub|32><rsup|16>,\<bbb-R\><rsub|64><rsup|8>><vspace|0.5fn>
          </hidden*>>
        </itemize>
      </shown>|<\hidden*>
        <strong|Software domains>

        <\itemize>
          <unroll|<\shown>
            <item><math|\<bbb-K\><around*|[|\<mathi\>|]>>,
            <math|Ball<around*|(|\<bbb-K\><rsub|cen>,\<bbb-K\><rsub|rad>|)>>,
            <math|\<ldots\>>
          </shown>|<\hidden*>
            <item><math|\<bbb-K\><rsup|n>>,
            <math|\<bbb-K\><rsup|r\<times\>c>>,
            <math|\<bbb-K\><around*|[|x|]>/<around*|(|x<rsup|n>|)>>,
            <math|\<ldots\>>
          </hidden*>|<\hidden*>
            <item><math|\<bbb-Z\><rsub|64>/<around*|(|m*\<bbb-Z\><rsub|64>|)>>,
            <math|\<ldots\>>
          </hidden*>|<\hidden*>
            <item><math|Recorder<around*|(|\<bbb-K\>|)>><vspace|0.5fn>
          </hidden*>>
        </itemize>
      </hidden*>>
    </compressed>

    \;
  </hidden>|<\hidden>
    <tit|Operations>

    <\compressed>
      <unroll|<\shown>
        <strong|Signature <math|<with|color|#ffa|\<Sigma\>>>:> the supported
        operations <math|\<sigma\>:\<bbb-K\><rsup|<around*|\||\<sigma\>|\|>>\<rightarrow\>\<bbb-K\>>

        <\equation*>
          \<Sigma\>\<assign\>\<Sigma\><rsub|basic>\<cup\>\<Sigma\><rsub|ext>
        </equation*>
      </shown>|<\hidden*>
        <strong|<math|<with|color|#ffa|\<Sigma\><rsub|basic>>>:> operations
        that most domains <math|\<bbb-K\>> supports

        <\overlays-phantoms|1|4>
          <\eqnarray*>
            <tformat|<table|<row|<cell|\<Sigma\><rsub|basic>>|<cell|\<supseteq\>>|<cell|<around*|{|move,neg,add,sub,sqr,mul,nmul,fma,fnma,\<ldots\>|}>>>|<row|<cell|>|<cell|>|<cell|\<nosymbol\><overlay-from|2|\<cup\><around*|{|inv,div,\<ldots\>|}>|>>>|<row|<cell|>|<cell|>|<cell|\<nosymbol\><overlay-from|3|\<cup\><around*|{|min,max,abs,\<ldots\>|}>\<cup\><around*|{|floor,ceil,round,trunc|}>|>>>|<row|<cell|>|<cell|>|<cell|\<nosymbol\><overlay-from|4|\<cup\><around*|{|not,or,xor,and|}>\<cup\><around*|{|eq,neq,lt,le,gt,ge|}>|>>>>>
          </eqnarray*>
        </overlays-phantoms>
      </hidden*>|<\hidden*>
        <strong|<math|<with|color|#ffa|\<Sigma\><rsub|ext>>>:> further
        operations for special domains <math|\<bbb-K\>> and user extensions

        <\eqnarray*>
          <tformat|<table|<row|<cell|\<Sigma\><rsub|ext>>|<cell|\<supseteq\>>|<cell|<around*|{|duplicate,permute|}>>>|<row|<cell|>|<cell|>|<cell|\<nosymbol\>\<cup\><around*|{|shl,shr,\<ldots\>|}>>>|<row|<cell|>|<cell|>|<cell|\<nosymbol\>\<cup\><around*|{|addc,subc,fmac,\<ldots\>|}>>>>>
        </eqnarray*>
      </hidden*>>
    </compressed>
  </hidden>|<\hidden>
    <tit|Special operations>

    <\compressed>
      <unroll|<\shown>
        <strong|Embedding <math|<with|color|pastel
        yellow|Boolean\<hookrightarrow\>\<bbb-K\>>>>

        <\itemize>
          <unroll|<\shown>
            <item>Needed for <math|eq,neq,\<ldots\>,not,or,\<ldots\>>
          </shown>|<\hidden*>
            <item>Usually, natural implementation: for
            <math|\<bbb-K\>=\<bbb-Z\><rsub|64>>, take
            <math|false\<mapsto\>0>, <math|true\<mapsto\>-1>.<vspace|0.75fn>
          </hidden*>>
        </itemize>
      </shown>|<\hidden*>
        <strong|Embedding <math|<with|color|pastel
        yellow|\<bbb-Z\>\<hookrightarrow\>\<bbb-K\>>> and
        <math|<with|color|pastel yellow|\<bbb-Z\><rsup|w>\<hookrightarrow\>\<bbb-K\><rsup|w>>>>

        <\itemize>
          <unroll|<\shown>
            <item>For <math|shl,shr,\<ldots\>>, where
            <math|shl<around*|(|a,n|)>\<assign\>a*2<rsup|n>>
          </shown>|<\hidden*>
            <item>For <math|permute> on SIMD types, with
            <math|permute<around*|(|a,\<pi\>|)>\<assign\><around*|(|a<rsub|\<pi\><around*|(|0|)>>,\<ldots\>,a<rsub|\<pi\><around*|(|w-1|)>>|)>><vspace|0.75fn>
          </hidden*>>
        </itemize>
      </hidden*>|<\hidden*>
        <strong|Multi-sorted signatures>

        <\itemize>
          <unroll|<\shown>
            <item>We can also create union domains like
            <math|\<bbb-K\>\<cup\>\<bbb-L\>>
          </shown>|<\hidden*>
            <item>And introduce variants <math|\<sigma\><rsub|\<bbb-K\>>,\<sigma\><rsub|\<bbb-L\>>,\<ldots\>>
            of <math|\<sigma\>\<in\>\<Sigma\>> depending on the sort
            <math|\<bbb-K\>>, <math|\<bbb-L\>>
          </hidden*>|<\hidden*>
            <item>And versions <math|\<sigma\><rsub|conditional>> with an extra
            <math|Boolean> argument (if <math|\<bbb-L\>=Boolean>)
          </hidden*>>
        </itemize>
      </hidden*>>
    </compressed>
  </hidden>|<\hidden>
    <tit|Why include an operation like <math|fma> in
    <math|\<Sigma\><rsub|basic>>?>

    <\compact>
      <unroll|<\shown>
        <strong|<with|color|red|Cons>>

        <\itemize>
          <unroll|<\shown>
            <item>Need to implement <math|fma> for many software domains
          </shown>|<\hidden*>
            <item>We could write a \Pclever\Q routine to simplify
            <math|a*b+c\<longrightarrow\>fma<around*|(|a,b,c|)>><vspace|0.5fn>
          </hidden*>>
        </itemize>
      </shown>|<\hidden*>
        <strong|<with|color|green|Pros>>

        <\itemize>
          <unroll|<\shown>
            <item><math|fma> corresponds to an important instruction in
            hardware
          </shown>|<\hidden*>
            <item><math|fma<around*|(|a,b,c|)>> does more than <math|a*b+c>
            for <math|\<bbb-R\><rsub|32>> and <math|\<bbb-R\><rsub|64>>
          </hidden*>|<\hidden*>
            <item>Systematic support of <math|\<pm\>x>, <math|\<pm\>x*y>,
            <math|\<pm\>x*y\<pm\>z> tends to yield better simplifications
          </hidden*>|<\hidden*>
            <item>On <math|\<bbb-Z\>/p*\<bbb-Z\>> via
            <math|\<bbb-R\><rsub|64>>, better implementation of
            <math|fma<around*|(|a,b,c|)>> than <math|a*b+c>:

            <\equation*>
              <tabular*|<tformat|<cwith|1|-1|1|1|cell-halign|r>|<cwith|1|-1|3|3|cell-halign|l>|<cwith|1|1|1|1|cell-row-span|1>|<cwith|1|1|1|1|cell-col-span|3>|<cwith|1|1|1|1|cell-halign|c>|<table|<row|<cell|<with|color|#fa6|reduce<around*|(|a|)>>>|<cell|>|<cell|>>|<row|<cell|q>|<cell|\<assign\>>|<cell|a\<cdot\>u>>|<row|<cell|q>|<cell|\<assign\>>|<cell|round<around*|(|q|)>>>|<row|<cell|r>|<cell|\<assign\>>|<cell|fnma<around*|(|p,q,r|)>>>|<row|<cell|>|<cell|>|<cell|>>|<row|<cell|>|<cell|>|<cell|>>>>>\<nocomma\><space|3em><tabular|<tformat|<cwith|1|-1|1|1|cell-halign|c>|<cwith|1|-1|3|3|cell-halign|l>|<cwith|1|1|1|1|cell-row-span|1>|<cwith|1|1|1|1|cell-col-span|3>|<table|<row|<cell|<with|color|#fa6|a*b+c>>|<cell|>|<cell|>>|<row|<cell|h>|<cell|\<assign\>>|<cell|a\<cdot\>b>>|<row|<cell|l>|<cell|\<assign\>>|<cell|fms<around*|(|a,b,h|)>>>|<row|<cell|r>|<cell|\<assign\>>|<cell|reduce<around*|(|h|)>>>|<row|<cell|r>|<cell|\<assign\>>|<cell|l+r>>|<row|<cell|r>|<cell|\<assign\>>|<cell|r+c>>>>>\<nocomma\><space|3em><tabular|<tformat|<cwith|1|-1|1|1|cell-halign|c>|<cwith|1|-1|3|3|cell-halign|l>|<cwith|5|5|1|1|cell-halign|c>|<cwith|5|5|3|3|cell-halign|l>|<cwith|1|1|1|1|cell-row-span|1>|<cwith|1|1|1|1|cell-col-span|3>|<table|<row|<cell|<with|color|#fa6|fma<around*|(|a,b,c|)>>>|<cell|>|<cell|>>|<row|<cell|h>|<cell|\<assign\>>|<cell|fma<around*|(|a,b,c|)>>>|<row|<cell|l>|<cell|\<assign\>>|<cell|fms<around*|(|a,b,h|)>>>|<row|<cell|l>|<cell|\<assign\>>|<cell|l+c>>|<row|<cell|r>|<cell|\<assign\>>|<cell|reduce<around*|(|h|)>>>|<row|<cell|r>|<cell|\<assign\>>|<cell|l+r>>>>>
            </equation*>
          </hidden*>>
        </itemize>
      </hidden*>>
    </compact>
  </hidden>|<\hidden>
    \;

    \;

    \;

    <\center>
      <with|frame-recolor|#fa6|frame-hpadding|2tab|frame-vpadding|1tab|<\black-floral1-frame>
        <\compressed>
          <\center>
            <strong|<with|color|#aff|Part III>>

            <strong|Using JIL>
          </center>
        </compressed>
      </black-floral1-frame>>
    </center>

    \;
  </hidden>|<\hidden>
    <tit|Typical workflow>

    \;

    <\overlays-greyed|1|5>
      <\alter-colors>
        <\wide-tabular>
          <tformat|<cwith|1|1|1|1|cell-halign|c>|<table|<row|<\cell>
            <math|<text|<math|<alternate-this|2|<long-arrow|\<rubber-rightarrow\>|initiate>|>>
            SLP <math|f<rsub|1>> <math|<alternate-this|3|<long-arrow|\<rubber-rightarrow\>|<text|transforms>>|>>
            SLP <math|f<rsub|2>> <math|<alternate-this|4|<long-arrow|\<rubber-rightarrow\>|optimize>|>>
            SLP <math|f<rsub|3>> <math|<alternate-this|5|<long-arrow|\<rubber-rightarrow\>|backend>|>>
            machine code><label|eq:workflow>>
          </cell>>>>
        </wide-tabular>

        \;

        Example: fast code for the cofactor matrix of a <math|4\<times\>4>
        matrix

        <\itemize>
          <item><alternate-this|2|<alter-colors|<strong|<alternate-this|2|Record|>>|red|#fa6>
          a program to compute a generic <math|<alternate-this|2|4\<times\>4|>>
          determinant <math|<alternate-this|2|\<longrightarrow\>|>> Initial
          SLP <math|<alternate-this|2|f<rsub|1>|>>|>

          <item><alternate-this|3|Compute gradient of
          <math|<alternate-this|3|f<rsub|1>|>>
          <math|<alternate-this|3|\<longrightarrow\>|>>
          <alter-colors|<strong|<alternate-this|3|algebraically
          transformed|>>|red|#fa6> SLP <math|<alternate-this|3|f<rsub|2>|>>|>

          <item><alternate-this|4|Simplify the result
          <math|<alternate-this|4|\<longrightarrow\>|>>
          <alter-colors|<strong|<alternate-this|4|optimized|>>|red|#fa6> SLP
          <math|<alternate-this|4|f<rsub|3>|>>|>

          <item><alternate-this|5|<alter-colors|<strong|<alternate-this|5|Compile|>>|red|#fa6>
          \ <math|<alternate-this|5|f<rsub|3>|>> for a specific architecture
          <math|<alternate-this|5|\<longrightarrow\>|>> binary code inside
          memory|>
        </itemize>
      </alter-colors|red|white>
    </overlays-greyed>
  </hidden>|<\hidden>
    <tit|Recording>

    <\compressed>
      <unroll|<\shown>
        Assume that we have a generic C++ function

        <\cpp-code>
          template\<less\>typename <with|color|#aff|C>\<gtr\>
          <with|color|#aff|C>

          <with|color|#fa6|f> (const <with|color|#aff|C>& x) { return x * x +
          x - 3; }
        </cpp-code>
      </shown>|<\hidden*>
        We may record an SLP for <cpp|<with|color|#fa6|f>> as follows:

        <\cpp-code>
          <with|color|#aff|slp>

          <with|color|#fa6|record_f> (const <with|color|#aff|domain>& tp) {

          \ \ <with|color|#ffa|recorder_start> (tp);

          \ \ <with|color|#aff|slp_variable> in =
          <with|color|#ffa|input_variable> ();

          \ \ <with|color|#aff|slp_variable> out= <with|color|#fa6|f> (in);

          \ \ <with|color|#ffa|specify_output> (out);

          \ \ return <with|color|#ffa|recorder_end> ();

          }
        </cpp-code>
      </hidden*>|<\hidden*>
        <\compact>
          <vspace*|0.3333fn><strong|Idea:> an instance of
          <cpp|<with|color|#aff|slp_variable>> specifies a data field on
          which to operate

          <phantom|<strong|Idea:> >operations on
          <cpp|<with|color|#aff|slp_variable>> are recorded instead of being
          executed
        </compact>
      </hidden*>>
    </compressed>
  </hidden>|<\hidden>
    <tit|Recording \U example>

    <\compressed>
      <unroll|<\shown>
        Assume that we have a generic C++ function

        <\cpp-code>
          template\<less\>typename <with|color|#aff|C>\<gtr\>
          <with|color|#aff|C>

          <with|color|#fa6|f> (const <with|color|#aff|C>& x) { return x * x +
          x - 3; }
        </cpp-code>
      </shown>|<\hidden*>
        \;

        <em|Step by step execution>

        <switch|<\shown>
          <\wide-tabular>
            <tformat|<table|<row|<\cell>
              <\cpp-code>
                <with|color|#ffa|recorder_start> (tp)

                \;

                \;

                \;

                \;

                \;

                \;

                \;
              </cpp-code>
            </cell>|<\cell>
              <tabular|<tformat|<cwith|1|3|1|-1|cell-bsep|1spc>|<cwith|1|4|1|1|cell-halign|r>|<table|<row|<cell|<with|color|#afa|in>>|<cell|<phantom|<block|<tformat|<table|<row|<cell|0>>>>>>>>|<row|<cell|<with|color|#afa|out>>|<cell|<phantom|<block|<tformat|<table|<row|<cell|4>>>>>>>>|<row|<cell|<with|color|#afa|prg>>|<cell|<phantom|<block|<tformat|<table|<row|<cell|<math|\<times\>>>>>>>>>>|<row|<cell|<with|color|#afa|data>>|<cell|<phantom|<block|<tformat|<table|<row|<cell|0>>>>>>>>|<row|<cell|>|<cell|<phantom|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>|<cell|<math|->>|<cell|4>|<cell|2>|<cell|3>>>>>>>>>>>
            </cell>>>>
          </wide-tabular>
        </shown>|<\hidden>
          <\wide-tabular>
            <tformat|<table|<row|<\cell>
              <\cpp-code>
                <with|color|#ffa|recorder_start> (tp)

                in = <with|color|#ffa|input_variable> ()

                \;

                \;

                \;

                \;

                \;

                \;
              </cpp-code>
            </cell>|<\cell>
              <tabular|<tformat|<cwith|1|3|1|-1|cell-bsep|1spc>|<cwith|1|4|1|1|cell-halign|r>|<table|<row|<cell|<with|color|#afa|in>>|<cell|<block|<tformat|<table|<row|<cell|0>>>>>>>|<row|<cell|<with|color|#afa|out>>|<cell|<phantom|<block|<tformat|<table|<row|<cell|4>>>>>>>>|<row|<cell|<with|color|#afa|prg>>|<cell|<phantom|<block|<tformat|<table|<row|<cell|<math|\<times\>>>>>>>>>>|<row|<cell|<with|color|#afa|data>>|<cell|<block|<tformat|<table|<row|<cell|0>>>>>>>|<row|<cell|>|<cell|<phantom|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>|<cell|<math|->>|<cell|4>|<cell|2>|<cell|3>>>>>>>>>>>
            </cell>>>>
          </wide-tabular>
        </hidden>|<\hidden>
          <\wide-tabular>
            <tformat|<table|<row|<\cell>
              <\cpp-code>
                <with|color|#ffa|recorder_start> (tp)

                in = <with|color|#ffa|input_variable> ()

                out= <with|color|#fa6|f> (in)

                \ \ x * x

                \;

                \;

                \;

                \;
              </cpp-code>
            </cell>|<\cell>
              <tabular|<tformat|<cwith|1|3|1|-1|cell-bsep|1spc>|<cwith|1|4|1|1|cell-halign|r>|<table|<row|<cell|<with|color|#afa|in>>|<cell|<block|<tformat|<table|<row|<cell|0>>>>>>>|<row|<cell|<with|color|#afa|out>>|<cell|<phantom|<block|<tformat|<table|<row|<cell|4>>>>>>>>|<row|<cell|<with|color|#afa|prg>>|<cell|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>>>>>>>|<row|<cell|<with|color|#afa|data>>|<cell|<block|<tformat|<table|<row|<cell|0>|<cell|0>>>>>>>|<row|<cell|>|<cell|<phantom|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>|<cell|<math|->>|<cell|4>|<cell|2>|<cell|3>>>>>>>>>>>
            </cell>>>>
          </wide-tabular>
        </hidden>|<\hidden>
          <\wide-tabular>
            <tformat|<table|<row|<\cell>
              <\cpp-code>
                <with|color|#ffa|recorder_start> (tp)

                in = <with|color|#ffa|input_variable> ()

                out= <with|color|#fa6|f> (in)

                \ \ x * x

                \ \ x * x + x

                \;

                \;

                \;
              </cpp-code>
            </cell>|<\cell>
              <tabular|<tformat|<cwith|1|3|1|-1|cell-bsep|1spc>|<cwith|1|4|1|1|cell-halign|r>|<table|<row|<cell|<with|color|#afa|in>>|<cell|<block|<tformat|<table|<row|<cell|0>>>>>>>|<row|<cell|<with|color|#afa|out>>|<cell|<phantom|<block|<tformat|<table|<row|<cell|4>>>>>>>>|<row|<cell|<with|color|#afa|prg>>|<cell|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>>>>>>>|<row|<cell|<with|color|#afa|data>>|<cell|<block|<tformat|<table|<row|<cell|0>|<cell|0>|<cell|0>>>>>>>|<row|<cell|>|<cell|<phantom|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>|<cell|<math|->>|<cell|4>|<cell|2>|<cell|3>>>>>>>>>>>
            </cell>>>>
          </wide-tabular>
        </hidden>|<\hidden>
          <\wide-tabular>
            <tformat|<table|<row|<\cell>
              <\cpp-code>
                <with|color|#ffa|recorder_start> (tp)

                in = <with|color|#ffa|input_variable> ()

                out= <with|color|#fa6|f> (in)

                \ \ x * x

                \ \ x * x + x

                \ \ 3

                \;

                \;
              </cpp-code>
            </cell>|<\cell>
              <tabular|<tformat|<cwith|1|3|1|-1|cell-bsep|1spc>|<cwith|1|4|1|1|cell-halign|r>|<table|<row|<cell|<with|color|#afa|in>>|<cell|<block|<tformat|<table|<row|<cell|0>>>>>>>|<row|<cell|<with|color|#afa|out>>|<cell|<phantom|<block|<tformat|<table|<row|<cell|4>>>>>>>>|<row|<cell|<with|color|#afa|prg>>|<cell|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>>>>>>>|<row|<cell|<with|color|#afa|data>>|<cell|<block|<tformat|<table|<row|<cell|0>|<cell|0>|<cell|0>|<cell|3>>>>>>>|<row|<cell|>|<cell|<phantom|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>|<cell|<math|->>|<cell|4>|<cell|2>|<cell|3>>>>>>>>>>>
            </cell>>>>
          </wide-tabular>
        </hidden>|<\hidden>
          <\wide-tabular>
            <tformat|<table|<row|<\cell>
              <\cpp-code>
                <with|color|#ffa|recorder_start> (tp)

                in = <with|color|#ffa|input_variable> ()

                out= <with|color|#fa6|f> (in)

                \ \ x * x

                \ \ x * x + x

                \ \ 3

                \ \ x * x + x - 3

                \;
              </cpp-code>
            </cell>|<\cell>
              <tabular|<tformat|<cwith|1|3|1|-1|cell-bsep|1spc>|<cwith|1|4|1|1|cell-halign|r>|<table|<row|<cell|<with|color|#afa|in>>|<cell|<block|<tformat|<table|<row|<cell|0>>>>>>>|<row|<cell|<with|color|#afa|out>>|<cell|<phantom|<block|<tformat|<table|<row|<cell|4>>>>>>>>|<row|<cell|<with|color|#afa|prg>>|<cell|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>|<cell|<math|->>|<cell|4>|<cell|2>|<cell|3>>>>>>>|<row|<cell|<with|color|#afa|data>>|<cell|<block|<tformat|<table|<row|<cell|0>|<cell|0>|<cell|0>|<cell|3>|<cell|0>>>>>>>|<row|<cell|>|<cell|<phantom|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>|<cell|<math|->>|<cell|4>|<cell|2>|<cell|3>>>>>>>>>>>
            </cell>>>>
          </wide-tabular>
        </hidden>|<\hidden>
          <\wide-tabular>
            <tformat|<table|<row|<\cell>
              <\cpp-code>
                <with|color|#ffa|recorder_start> (tp)

                in = <with|color|#ffa|input_variable> ()

                out= <with|color|#fa6|f> (in)

                \ \ x * x

                \ \ x * x + x

                \ \ 3

                \ \ x * x + x - 3

                <with|color|#ffa|specify_output> (out)
              </cpp-code>
            </cell>|<\cell>
              <tabular|<tformat|<cwith|1|3|1|-1|cell-bsep|1spc>|<cwith|1|4|1|1|cell-halign|r>|<table|<row|<cell|<with|color|#afa|in>>|<cell|<block|<tformat|<table|<row|<cell|0>>>>>>>|<row|<cell|<with|color|#afa|out>>|<cell|<block|<tformat|<table|<row|<cell|4>>>>>>>|<row|<cell|<with|color|#afa|prg>>|<cell|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>|<cell|<math|->>|<cell|4>|<cell|2>|<cell|3>>>>>>>|<row|<cell|<with|color|#afa|data>>|<cell|<block|<tformat|<table|<row|<cell|0>|<cell|0>|<cell|0>|<cell|3>|<cell|0>>>>>>>|<row|<cell|>|<cell|<phantom|<block|<tformat|<table|<row|<cell|<math|\<times\>>>|<cell|1>|<cell|0>|<cell|0>|<cell|<math|+>>|<cell|2>|<cell|1>|<cell|0>|<cell|<math|->>|<cell|4>|<cell|2>|<cell|3>>>>>>>>>>>
            </cell>>>>
          </wide-tabular>
        </hidden>>
      </hidden*>|<\hidden*>
        <strong|Note:> clean execution trace more important than efficient
        execution of <cpp|<with|color|#fa6|f>>
      </hidden*>>
    </compressed>
  </hidden>|<\hidden>
    <tit|Traditional transformations>

    <\compressed>
      <unroll|<\shown>
        <strong|Important:> by design, all transformations of SLP take linear
        time in JIL<vspace|0.5fn>
      </shown>|<\hidden*>
        <strong|Simplification>

        <\itemize>
          <item>Common subexpression elimination

          <item>CSE + algebraic simplifications (<math|0+x\<rightarrow\>x>,
          etc.)

          <item>Dead code elimination

          <item>Rewrite <math|a*b+c\<longrightarrow\>fma<around*|(|a,b,c|)>><vspace|0.5fn>
        </itemize>
      </hidden*>|<\hidden*>
        <strong|Backend>

        <\itemize>
          <item>Emulate missing instructions

          <item>Optimizations for immediate arguments

          <item>Rescheduling

          <item>Register allocation
        </itemize>
      </hidden*>>
    </compressed>
  </hidden>|<\hidden>
    <tit|Algebraic transformations>

    <\compressed>
      <unroll|<\shown>
        <strong|General transformations>

        <\itemize>
          <item>Forward and backward differentiation:
          <math|f\<longmapsto\>Jac<around*|(|f|)>>

          <item>If <math|f> is linear (<abbr|w.r.t.> some of its inputs),
          then compute the transposed map

          <item><math|P\<in\>\<bbb-K\><around*|[|x<rsub|1>,\<ldots\>,x<rsub|n>|]>>
          <math|<with|color|white|\<longmapsto\>>> homogeneous
          <math|<wide|P|~>\<in\>\<bbb-K\><around*|[|x<rsub|1>,\<ldots\>,x<rsub|n>,t|]>>
          with <math|P<around*|(|\<b-x\>|)>=<wide|P|~><around*|(|\<b-x\>,1|)>>

          <item>Add just enough reductions to avoid overflows for redundant
          <hgroup|representations><vspace|0.6666fn>
        </itemize>
      </shown>|<\hidden*>
        <strong|Lifting and related transformations>

        <\itemize>
          <item>Lift SLP over <math|\<bbb-K\>> to SLP over <math|\<bbb-A\>>
          for <math|\<bbb-K\>>-algebra <math|\<bbb-A\>>

          <item>Reinterpret SLP over <math|\<bbb-A\>> as SLP over
          <math|\<bbb-K\>>

          <item>Specific vector and ball lifts

          <item>Reduce the number of divisions in SLPs
        </itemize>
      </hidden*>>
    </compressed>
  </hidden>|<\hidden>
    <tit|Example of complexification>

    <switch|<\shown>
      <\compressed>
        <math|\<bbb-K\><around*|[|\<mathi\>|]>> is an <strong|SLP algebra>:
        operations in <math|\<Sigma\><rsub|basic>> can be implemented using
        SLPs:

        <\equation*>
          <tabular|<tformat|<cwith|1|1|1|1|cell-halign|l>|<cwith|1|1|1|1|cell-background|#022>|<table|<row|<cell|<text|add>>>|<row|<cell|in<around*|(|<with|color|#aff|x<rsub|1>>,<with|color|#aff|y<rsub|1>>,<with|color|#aff|x<rsub|2>>,<with|color|#aff|y<rsub|2>>|)>>>|<row|<cell|x<rsub|3>\<assign\>x<rsub|1>+x<rsub|2>>>|<row|<cell|y<rsub|3>\<assign\>y<rsub|1>+y<rsub|2>>>|<row|<cell|out<around*|(|<with|color|#fa6|x<rsub|3>>,<with|color|#fa6|y<rsub|3>>|)>>>|<row|<cell|>>|<row|<cell|>>>>>\<nocomma\><space|2em><tabular|<tformat|<cwith|1|1|1|1|cell-halign|l>|<cwith|1|1|1|1|cell-background|#022>|<table|<row|<cell|<text|subtract>>>|<row|<cell|in<around*|(|<with|color|#aff|x<rsub|1>>,<with|color|#aff|y<rsub|1>>,<with|color|#aff|x<rsub|2>>,<with|color|#aff|y<rsub|2>>|)>>>|<row|<cell|x<rsub|3>\<assign\>x<rsub|1>-x<rsub|2>>>|<row|<cell|y<rsub|3>\<assign\>y<rsub|1>-y<rsub|2>>>|<row|<cell|out<around*|(|<with|color|#fa6|x<rsub|3>>,<with|color|#fa6|y<rsub|3>>|)>>>|<row|<cell|>>|<row|<cell|>>>>>\<nocomma\><space|2em><tabular|<tformat|<cwith|1|1|1|1|cell-halign|l>|<cwith|1|1|1|1|cell-background|#022>|<table|<row|<cell|<text|multiply>>>|<row|<cell|in<around*|(|<with|color|#aff|x<rsub|1>>,<with|color|#aff|y<rsub|1>>,<with|color|#aff|x<rsub|2>>,<with|color|#aff|y<rsub|2>>|)>>>|<row|<cell|x<rsub|3>\<assign\>x<rsub|1>\<cdot\>x<rsub|2>>>|<row|<cell|y<rsub|3>\<assign\>x<rsub|1>\<cdot\>y<rsub|2>>>|<row|<cell|x<rsub|3>\<assign\>fms<around*|(|y<rsub|1>,y<rsub|2>,x<rsub|3>|)>>>|<row|<cell|y<rsub|3>\<assign\>fma<around*|(|x<rsub|2>,y<rsub|1>,y<rsub|3>|)>>>|<row|<cell|out<around*|(|<with|color|#fa6|x<rsub|3>>,<with|color|#fa6|y<rsub|3>>|)>>>>>>
        </equation*>

        Be careful with <em|aliasing> like in <math|z\<assign\>u\<cdot\>z>
      </compressed>
    </shown>|<\hidden>
      <\compressed>
        <math|\<bbb-K\><around*|[|\<mathi\>|]>> is an <strong|SLP algebra>:
        operations in <math|\<Sigma\><rsub|basic>> can be implemented using
        SLPs:

        <\small>
          <\equation*>
            <tabular|<tformat|<cwith|1|1|1|1|cell-halign|c>|<table|<row|<cell|<text|add>>>|<row|<cell|x<rsub|3>\<assign\>x<rsub|1>+x<rsub|2>>>|<row|<cell|y<rsub|3>\<assign\>y<rsub|1>+y<rsub|2>>>|<row|<cell|>>|<row|<cell|>>>>>\<nocomma\><space|2em><tabular|<tformat|<cwith|1|1|1|1|cell-halign|c>|<table|<row|<cell|<text|subtract>>>|<row|<cell|x<rsub|3>\<assign\>x<rsub|1>-x<rsub|2>>>|<row|<cell|y<rsub|3>\<assign\>y<rsub|1>-y<rsub|2>>>|<row|<cell|>>|<row|<cell|>>>>>\<nocomma\><space|2em><tabular|<tformat|<cwith|1|1|1|1|cell-halign|c>|<table|<row|<cell|<text|multiply>>>|<row|<cell|x<rsub|3>\<assign\>x<rsub|1>\<cdot\>x<rsub|2>>>|<row|<cell|y<rsub|3>\<assign\>x<rsub|1>\<cdot\>y<rsub|2>>>|<row|<cell|x<rsub|3>\<assign\>fms<around*|(|y<rsub|1>,y<rsub|2>,x<rsub|3>|)>>>|<row|<cell|y<rsub|3>\<assign\>fma<around*|(|x<rsub|2>,y<rsub|1>,y<rsub|3>|)>>>>>>
          </equation*>
        </small>

        <\small>
          <\equation*>
            <tabular|<tformat|<cwith|2|2|1|1|cell-background|#002>|<cwith|3|3|1|1|cell-background|#020>|<cwith|4|4|1|1|cell-background|#200>|<table|<row|<cell|in<around*|(|z<rsub|0>|)>>>|<row|<cell|z<rsub|1>\<assign\>z<rsub|0>\<cdot\>z<rsub|0>>>|<row|<cell|z<rsub|2>\<assign\>z<rsub|1>+z<rsub|0>>>|<row|<cell|z<rsub|4>\<assign\>z<rsub|2>-7>>|<row|<cell|out<around*|(|z<rsub|4>|)>>>>>><space|1em><long-arrow|\<rubber-rightarrow\>|lift><space|1em><tabular|<tformat|<cwith|6|6|1|1|cell-halign|l>|<cwith|2|5|1|1|cell-background|#002>|<cwith|6|7|1|1|cell-background|#020>|<cwith|8|9|1|1|cell-background|#200>|<table|<row|<cell|in<around*|(|x<rsub|0>,y<rsub|0>|)>>>|<row|<cell|x<rsub|1>\<assign\>x<rsub|0>\<cdot\>x<rsub|0>>>|<row|<cell|y<rsub|1>\<assign\>x<rsub|0>\<cdot\>y<rsub|0>>>|<row|<cell|x<rsub|1>\<assign\>fms<around*|(|y<rsub|0>,y<rsub|0>,x<rsub|1>|)>>>|<row|<cell|y<rsub|1>\<assign\>fma<around*|(|x<rsub|0>,y<rsub|0>,y<rsub|1>|)>>>|<row|<cell|x<rsub|2>\<assign\>x<rsub|1>+x<rsub|0>>>|<row|<cell|y<rsub|2>\<assign\>y<rsub|1>+y<rsub|0>>>|<row|<cell|x<rsub|4>\<assign\>x<rsub|2>-7>>|<row|<cell|y<rsub|4>\<assign\>y<rsub|2>-0>>|<row|<cell|out<around*|(|x<rsub|4>,y<rsub|4>|)>>>>>><space|1em><long-arrow|\<rubber-rightarrow\>|simplify><space|1em><tabular|<tformat|<cwith|6|6|1|1|cell-halign|l>|<cwith|2|5|1|1|cell-background|#002>|<cwith|6|7|1|1|cell-background|#020>|<cwith|8|10|1|1|cell-background|#200>|<cwith|9|9|1|1|cell-background|>|<table|<row|<cell|in<around*|(|x<rsub|0>,y<rsub|0>|)>>>|<row|<cell|x<rsub|1>\<assign\>x<rsub|0>\<cdot\>x<rsub|0>>>|<row|<cell|y<rsub|1>\<assign\>x<rsub|0>\<cdot\>y<rsub|0>>>|<row|<cell|x<rsub|1>\<assign\>fms<around*|(|y<rsub|0>,y<rsub|0>,x<rsub|1>|)>>>|<row|<cell|y<rsub|1>\<assign\>fma<around*|(|x<rsub|0>,y<rsub|0>,y<rsub|1>|)>>>|<row|<cell|x<rsub|2>\<assign\>x<rsub|1>+x<rsub|0>>>|<row|<cell|y<rsub|2>\<assign\>y<rsub|1>+y<rsub|0>>>|<row|<cell|x<rsub|4>\<assign\>x<rsub|2>-7>>|<row|<cell|out<around*|(|x<rsub|4>,y<rsub|2>|)>>>>>>
          </equation*>
        </small>
      </compressed>
    </hidden>>
  </hidden>|<\hidden>
    <tit|Some timings>

    <\big-table*|<normal-size|<tabular|<tformat|<cwith|10|10|1|1|cell-bborder|0ln>|<cwith|1|-1|1|1|cell-lborder|0ln>|<cwith|1|-1|1|1|cell-rborder|1ln>|<cwith|1|-1|2|2|cell-lborder|1ln>|<cwith|1|-1|9|9|cell-lborder|0ln>|<cwith|1|-1|8|8|cell-rborder|0ln>|<cwith|1|-1|9|9|cell-rborder|1ln>|<cwith|1|1|1|-1|cell-tborder|0ln>|<cwith|1|1|1|-1|cell-bborder|1ln>|<cwith|2|2|1|-1|cell-tborder|1ln>|<cwith|1|1|1|1|cell-lborder|0ln>|<cwith|2|-1|2|9|cell-halign|r>|<cwith|2|-1|1|1|cell-halign|r>|<cwith|1|1|1|-1|cell-halign|c>|<cwith|10|10|9|9|cell-bborder|0ln>|<cwith|2|-1|9|9|cell-lborder|0ln>|<cwith|2|-1|8|8|cell-rborder|0ln>|<cwith|2|-1|9|9|cell-rborder|0ln>|<cwith|1|1|9|9|cell-tborder|0ln>|<cwith|1|1|9|9|cell-bborder|1ln>|<cwith|2|2|9|9|cell-tborder|1ln>|<cwith|1|1|9|9|cell-lborder|0ln>|<cwith|1|1|8|8|cell-rborder|0ln>|<cwith|1|1|9|9|cell-rborder|0ln>|<table|<row|<cell|<math|n>>|<cell|len>|<cell|cse>|<cell|sim>|<cell|<math|\<nabla\>>>|<cell|lift>|<cell|reg>|<cell|jit>|<cell|exe>>|<row|<cell|2>|<cell|2>|<cell|1460>|<cell|2687>|<cell|1221>|<cell|1796>|<cell|1436>|<cell|12784>|<cell|10.00>>|<row|<cell|3>|<cell|9>|<cell|543>|<cell|825>|<cell|407>|<cell|614>|<cell|562>|<cell|2963>|<cell|2.222>>|<row|<cell|4>|<cell|40>|<cell|229>|<cell|341>|<cell|184>|<cell|262>|<cell|291>|<cell|924>|<cell|0.550>>|<row|<cell|5>|<cell|205>|<cell|126>|<cell|196>|<cell|125>|<cell|163>|<cell|237>|<cell|452>|<cell|0.424>>|<row|<cell|6>|<cell|1236>|<cell|97>|<cell|149>|<cell|112>|<cell|138>|<cell|221>|<cell|455>|<cell|0.422>>|<row|<cell|7>|<cell|8659>|<cell|89>|<cell|144>|<cell|127>|<cell|134>|<cell|229>|<cell|447>|<cell|0.419>>|<row|<cell|8>|<cell|69280>|<cell|94>|<cell|171>|<cell|125>|<cell|160>|<cell|261>|<cell|470>|<cell|0.424>>|<row|<cell|9>|<cell|623529>|<cell|133>|<cell|207>|<cell|188>|<cell|178>|<cell|347>|<cell|472>|<cell|0.865>>|<row|<cell|10>|<cell|6235300>|<cell|158>|<cell|242>|<cell|296>|<cell|245>|<cell|391>|<cell|522>|<cell|3.308>>>>>>>
      <label|tab:slp-xeon>Timings in cycles per instruction on <name|Intel>
      <name|Xeon> for (very) naive <math|n\<times\>n> determinants.
    </big-table*>
  </hidden>|<\hidden>
    \;

    \;

    \;

    <\center>
      <with|frame-recolor|#fa6|frame-hpadding|2tab|frame-vpadding|1tab|<\black-floral1-frame>
        <\compressed>
          <\center>
            <strong|<with|color|#aff|Part IV>>

            <strong|Upcoming features>
          </center>
        </compressed>
      </black-floral1-frame>>
    </center>

    \;
  </hidden>|<\hidden>
    <tit|Integrated benching, selection, and caching>

    <unroll|<\shown>
      We can efficiently generate multiple implementations of a function
      and<new-line>determine which one is best by benching each
      implementation<vspace|1fn>
    </shown>|<\hidden*>
      <\compressed>
        This requires

        <\itemize>
          <item>Good random sample generators

          <item>Mechanism to list alternative implementations and bench them

          <item>Mechanism to cache results on disk

          <item>Mechanism to predict timings without running any code
        </itemize>
      </compressed>
    </hidden*>>
  </hidden>|<\hidden>
    <tit|Beyond SLPs>

    <unroll|<\shown>
      Typical use cases:

      <\compressed>
        <\itemize>
          <unroll|<\shown>
            <item>Run SLP <math|f:\<bbb-K\><rsup|m>\<rightarrow\>\<bbb-K\><rsup|n>>
            on vectors: <math|<wide|f|~>:<around*|(|\<bbb-K\><rsup|m>|)><rsup|N>\<longrightarrow\><around*|(|\<bbb-K\><rsup|n>|)><rsup|N>>
            with <math|N\<gg\>1>
          </shown>|<\hidden*>
            <item>Iterate SLP <math|f:\<bbb-K\><rsup|n>\<rightarrow\>\<bbb-K\><rsup|n>>
            until condition is met (<abbr|e.g.> homotopy continuation)
          </hidden*>|<\hidden*>
            <item>SLP over \Pbig\Q SLP algebra <math|\<bbb-A\>> over
            <math|\<bbb-K\>> <math|\<longrightarrow\>> subroutines for
            operations in <math|\<bbb-A\>>
          </hidden*>>
        </itemize>
      </compressed>
    </shown>|<\hidden*>
      Need for <em|shallow> control structures
    </hidden*>|<\hidden*>
      Dedicated support for various frequent
      patterns<text-dots><new-line><text-dots> or better to use general
      purpose techniques?
    </hidden*>>
  </hidden>|<\hidden>
    <tit|Example on AVX512 platform>

    <unroll|<\shown>
      <\compressed>
        <strong|Input:> <math|N\<in\>\<bbb-N\>> and vectors
        <math|u,v\<in\>\<bbb-R\><rsub|64><around*|[|\<mathi\>|]><rsup|N>>

        <strong|Output:> <math|w\<in\>\<bbb-R\><rsub|64><around*|[|\<mathi\>|]><rsup|N>>
        with <math|w<rsub|i>=u<rsub|i>*v<rsub|i>> for all
        <math|0\<leqslant\>i\<less\>N>
      </compressed>
    </shown>|<\hidden*>
      First assume that <math|N=32>.
    </hidden*>|<\hidden*>
      <\compressed>
        <\itemize>
          <item>Reinterpret <math|u,v> as
          <math|<wide|u|~>,<wide|v|~>\<in\><around*|(|\<bbb-R\><rsub|64><rsup|8>|)><rsup|4><around*|[|\<mathi\>|]>>
          (this requires SIMD matrix transposition)

          <item>Lift complex multiplication over <math|\<bbb-R\><rsub|64>> to
          multiplication in <math|<around*|(|\<bbb-R\><rsub|64><rsup|8>|)><rsup|4><around*|[|\<mathi\>|]>>
          over <math|\<bbb-R\><rsub|64><rsup|8>>

          <item>Reinterpret <math|<wide|w|~>\<assign\><wide|u|~>*<wide|v|~>>
          as an element of <math|\<bbb-R\><rsub|64><around*|[|\<mathi\>|]><rsup|N>>
          (another SIMD transposition)
        </itemize>
      </compressed>
    </hidden*>|<\hidden*>
      Create a loop which repeats this code until
      <math|N\<less\>32>.<tiny-switch|<shown|>|<hidden| Next>>
    </hidden*>|<\hidden*>
      <\compressed>
        <\itemize>
          <item>Reduce further to the cases when <math|N\<less\>16> and then
          <math|N\<less\>8>

          <item>Reduce the case when <math|N\<less\>8>
          <math|\<longrightarrow\>> case <math|N=8> with an appropriate mask
        </itemize>
      </compressed>
    </hidden*>|<\hidden*>
      This whole implementation can be run using only AVX512 vector
      instructions.\ 
    </hidden*>>
  </hidden>|<\hidden>
    <\center>
      <\with|par-par-sep|0.3333fn>
        <\center>
          <really-huge|<strong|<with|color|orange|Thank you!>>>
        </center>
      </with>

      <tabular|<tformat|<cwith|1|2|1|1|cell-halign|c>|<cwith|1|1|1|1|cell-tsep|4spc>|<cwith|1|1|1|1|cell-hyphen|t>|<cwith|2|2|1|1|cell-tsep|1em>|<table|<row|<\cell>
        <with|shadow-dx|0ln|shadow-dy|0ln|shadow-color|white|shadow-blur-radius|0.25ln|<shadow|<image|Download-TeXmacs.png|0.62par|||>>>
      </cell>>|<row|<cell|<with|color|#ffa|<verbatim|http://www.<anim-repeat|<anim-compose|<anim-constant|<with|color|orange|T<space|-0.2spc><with|color|orange|<rsub|<with|math-level|0|font-shape|small-caps|e>>>X<space|-0.2spc>><rsub|<with|math-level|0|font-shape|small-caps|ma<space|-0.2spc>cs>>|1sec>|<anim-constant|T<space|-0.2spc><with|color|orange|<rsub|<with|math-level|0|font-shape|small-caps|e>>>X<space|-0.2spc><with|color|orange|<rsub|<with|math-level|0|font-shape|small-caps|ma<space|-0.2spc>cs>>>|1sec>>>.org>>>>>>>
    </center>

    \;
  </hidden>>
</body>

<\initial>
  <\collection>
    <associate|body-color|#002020>
    <associate|dark-vador-input-deco-ornament-vpadding|0.5spc>
    <associate|eqn-long-above|<macro|0fn>>
    <associate|eqn-long-below|<macro|0fn>>
    <associate|eqn-short-above|<macro|0fn>>
    <associate|eqn-short-below|<macro|0fn>>
    <associate|font|greek=pagella,math=pagella,typewriter=roman,Linux
    Biolinum>
    <associate|font-family|rm>
    <associate|info-flag|none>
    <associate|locus-color|#aaf>
    <associate|magnification|2>
    <associate|math-font|math-pagella>
    <associate|ornament-corner|50%>
    <associate|ornament-hpadding|1.5spc>
    <associate|ornament-vpadding|1.5spc>
    <associate|page-bot|-5mm>
    <associate|page-height|auto>
    <associate|page-medium|beamer>
    <associate|page-type|8:5>
    <associate|page-width|auto>
    <associate|preamble|false>
  </collection>
</initial>

<\references>
  <\collection>
    <associate|auto-1|<tuple|<with|mode|<quote|math>|\<bullet\>>|?>>
    <associate|eq:intro-slp|<tuple|?|?>>
    <associate|eq:workflow|<tuple|<with|mode|<quote|math>|\<bullet\>>|?>>
    <associate|tab:slp-xeon|<tuple|<with|mode|<quote|math>|\<bullet\>>|?>>
  </collection>
</references>

<\auxiliary>
  <\collection>
    <\associate|table>
      <\tuple|normal>
        Timings in cycles per instruction on
        <with|font-family|<quote|rm>|font-shape|<quote|small-caps>|Intel>
        <with|font-family|<quote|rm>|font-shape|<quote|small-caps>|Xeon> for
        (very) naive <with|color|<quote|pastel
        red>|font-family|<quote|rm>|<with|mode|<quote|math>|n\<times\>n>>
        determinants.
      </tuple|<pageref|auto-1>>
    </associate>
  </collection>
</auxiliary>