hynky HF staff commited on
Commit
1fa3117
·
1 Parent(s): 566dccd

add dist + nits

Browse files
Files changed (47) hide show
  1. dist/assets/data/fp8/.DS_Store +0 -0
  2. dist/assets/images/5D_nutshell_tp_sp.svg +1 -1
  3. dist/assets/images/5d_full.svg +0 -0
  4. dist/assets/images/5d_nutshell_cp.svg +1 -1
  5. dist/assets/images/5d_nutshell_ep.svg +0 -0
  6. dist/assets/images/activation_recomputation.svg +1 -1
  7. dist/assets/images/cp_70bmemoryusage.svg +0 -0
  8. dist/assets/images/cp_attnmask.svg +0 -0
  9. dist/assets/images/cp_overlap_all2all.svg +1 -1
  10. dist/assets/images/cp_overlap_allgather.svg +1 -1
  11. dist/assets/images/cp_zigzagmask.svg +0 -0
  12. dist/assets/images/diving_primergpu.svg +0 -0
  13. dist/assets/images/diving_primergpu2.svg +0 -0
  14. dist/assets/images/dp_overlap1.svg +1 -1
  15. dist/assets/images/dp_overlap2.svg +1 -1
  16. dist/assets/images/dp_overlap3.svg +1 -1
  17. dist/assets/images/dp_zero1_overlap.svg +1 -1
  18. dist/assets/images/dp_zero2_overlap.svg +1 -1
  19. dist/assets/images/dp_zero3_bwd.svg +0 -0
  20. dist/assets/images/dp_zero3_fwd.svg +0 -0
  21. dist/assets/images/dp_zero3_overlap.svg +1 -1
  22. dist/assets/images/first_steps_memory_profile.svg +0 -0
  23. dist/assets/images/memory_profile.svg +0 -0
  24. dist/assets/images/pp_1f1b.svg +0 -0
  25. dist/assets/images/pp_1f1b_interleaved.svg +0 -0
  26. dist/assets/images/pp_afab.svg +1 -1
  27. dist/assets/images/pp_afab2.svg +0 -0
  28. dist/assets/images/tp_diagram.svg +1 -1
  29. dist/assets/images/tp_overlap.svg +1 -1
  30. dist/assets/images/tp_sp_overlap.svg +1 -1
  31. dist/assets/images/ultra-cheatsheet.svg +0 -0
  32. dist/assets/images/zero_memory.svg +0 -0
  33. dist/fragments/cp_8Bmemoryusage.html +1 -0
  34. dist/fragments/dp_ourjourney_memoryusage.html +1 -0
  35. dist/fragments/dp_scaling.html +1 -0
  36. dist/fragments/memusage_activations.html +1 -0
  37. dist/fragments/pp_bubblesize.html +1 -0
  38. dist/fragments/pp_comm_bandwidth.html +1 -0
  39. dist/fragments/pp_memoryusage.html +1 -0
  40. dist/fragments/tp_memoryusage.html +1 -0
  41. dist/fragments/tp_scaling.html +1 -0
  42. dist/fragments/tp_sp_memoryusage.html +1 -0
  43. dist/fragments/tp_sp_scaling.html +1 -0
  44. dist/fragments/zero3_memoryusage.html +1 -0
  45. dist/index.html +56 -45
  46. src/index.html +56 -46
  47. webpack.config.js +8 -0
dist/assets/data/fp8/.DS_Store ADDED
Binary file (6.15 kB). View file
 
dist/assets/images/5D_nutshell_tp_sp.svg CHANGED
dist/assets/images/5d_full.svg CHANGED
dist/assets/images/5d_nutshell_cp.svg CHANGED
dist/assets/images/5d_nutshell_ep.svg CHANGED
dist/assets/images/activation_recomputation.svg CHANGED
dist/assets/images/cp_70bmemoryusage.svg CHANGED
dist/assets/images/cp_attnmask.svg CHANGED
dist/assets/images/cp_overlap_all2all.svg CHANGED
dist/assets/images/cp_overlap_allgather.svg CHANGED
dist/assets/images/cp_zigzagmask.svg CHANGED
dist/assets/images/diving_primergpu.svg CHANGED
dist/assets/images/diving_primergpu2.svg CHANGED
dist/assets/images/dp_overlap1.svg CHANGED
dist/assets/images/dp_overlap2.svg CHANGED
dist/assets/images/dp_overlap3.svg CHANGED
dist/assets/images/dp_zero1_overlap.svg CHANGED
dist/assets/images/dp_zero2_overlap.svg CHANGED
dist/assets/images/dp_zero3_bwd.svg CHANGED
dist/assets/images/dp_zero3_fwd.svg CHANGED
dist/assets/images/dp_zero3_overlap.svg CHANGED
dist/assets/images/first_steps_memory_profile.svg CHANGED
dist/assets/images/memory_profile.svg CHANGED
dist/assets/images/pp_1f1b.svg CHANGED
dist/assets/images/pp_1f1b_interleaved.svg CHANGED
dist/assets/images/pp_afab.svg CHANGED
dist/assets/images/pp_afab2.svg CHANGED
dist/assets/images/tp_diagram.svg CHANGED
dist/assets/images/tp_overlap.svg CHANGED
dist/assets/images/tp_sp_overlap.svg CHANGED
dist/assets/images/ultra-cheatsheet.svg CHANGED
dist/assets/images/zero_memory.svg CHANGED
dist/fragments/cp_8Bmemoryusage.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=995b326f-ddf4-41a8-b9e5-fce1da175b3f class=plotly-graph-div style="height:410px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("995b326f-ddf4-41a8-b9e5-fce1da175b3f")&&Plotly.newPlot("995b326f-ddf4-41a8-b9e5-fce1da175b3f",[{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!0,x:["1024","4096","16384","65536","131072"],y:[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!0,x:["1024","4096","16384","65536","131072"],y:[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!0,x:["1024","4096","16384","65536","131072"],y:[59.828125,59.828125,59.828125,59.828125,59.828125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!0,x:["1024","4096","16384","65536","131072"],y:[4.25,17,68,272,544],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer 
States",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[29.9140625,29.9140625,29.9140625,29.9140625,29.9140625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[2.75,11,44,176,352],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[29.9140625,29.9140625,29.9140625,29.9140625,29.9140625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[.6875,2.75,11,44,88],type:"bar",xaxis:"x3",yaxis:"y3"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26
"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:"
"}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#e
d7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2888888888888889],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray",title:{text:"Memory Usage (GB)"}},xaxis2:{anchor:"y2",domain:[.35555555555555557,.6444444444444445],title:{text:"Sequence 
Length"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},xaxis3:{anchor:"y3",domain:[.7111111111111111,1],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis3:{anchor:"x3",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"No Parallelism",x:.14444444444444446,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=2 CP=1",x:.5,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=2 CP=4",x:.8555555555555556,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x3 domain",y0:80,y1:80,yref:"y3"}],title:{text:"Memory Usage for 8B Model"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:1e3,height:410},{responsive:!0})</script> </div>
dist/fragments/dp_ourjourney_memoryusage.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=9f507a17-fb27-4b9a-9224-34ffad9cd0d4 class=plotly-graph-div style="height:410px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("9f507a17-fb27-4b9a-9224-34ffad9cd0d4")&&Plotly.newPlot("9f507a17-fb27-4b9a-9224-34ffad9cd0d4",[{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[2.3017578125,2.3017578125,2.3017578125,2.3017578125,2.3017578125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[2.3017578125,2.3017578125,2.3017578125,2.3017578125,2.3017578125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer states",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[9.20703125,9.20703125,9.20703125,9.20703125,9.20703125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[1.0625,2.125,4.25,8.5,17],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer 
states",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[59.828125,59.828125,59.828125,59.828125,59.828125],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[4.25,8.5,17,34,68],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[131.4140625,131.4140625,131.4140625,131.4140625,131.4140625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[131.4140625,131.4140625,131.4140625,131.4140625,131.4140625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer states",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[525.65625,525.65625,525.65625,525.65625,525.65625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[21.25,42.5,85,170,340],type:"bar",xaxis:"x3",yaxis:"y3"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"hea
tmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar
",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[
.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2888888888888889]},yaxis:{anchor:"x",domain:[0,1],range:[0,150],title:{text:"GB memory"}},xaxis2:{anchor:"y2",domain:[.35555555555555557,.6444444444444445]},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150]},xaxis3:{anchor:"y3",domain:[.7111111111111111,1]},yaxis3:{anchor:"x3",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150]},annotations:[{font:{size:16},showarrow:!1,text:"1B model",x:.14444444444444446,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"8B 
model",x:.5,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"70B model",x:.8555555555555556,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x3 domain",y0:80,y1:80,yref:"y3"}],title:{text:"Memory Usage vs Sequence Length for Different Model Sizes"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:1e3,height:410},{responsive:!0})</script> </div>
dist/fragments/dp_scaling.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=f6b00dd8-6230-46cf-9b38-f7fd425a1dd3 class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("f6b00dd8-6230-46cf-9b38-f7fd425a1dd3")&&Plotly.newPlot("f6b00dd8-6230-46cf-9b38-f7fd425a1dd3",[{marker:{color:"#4ea5b7"},name:"Throughput (tokens/sec/GPU)",width:.7,x:["8","16","32","64","128","256"],y:[40149.94,37609.69,35367.61,31112.23,26446.44,15700.38],type:"bar",xaxis:"x",yaxis:"y"},{base:[37609.69],marker:{color:"#e889ab"},name:"Performance Drop",showlegend:!0,width:.0875,x:["16"],y:[2540.25],type:"bar",xaxis:"x",yaxis:"y"},{base:[35367.61],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["32"],y:[2242.0800000000017],type:"bar",xaxis:"x",yaxis:"y"},{base:[31112.23],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["64"],y:[4255.380000000001],type:"bar",xaxis:"x",yaxis:"y"},{base:[26446.44],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["128"],y:[4665.790000000001],type:"bar",xaxis:"x",yaxis:"y"},{base:[15700.38],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["256"],y:[10746.06],type:"bar",xaxis:"x",yaxis:"y"},{line:{color:"#e889ab"},marker:{color:"#e889ab"},mode:"lines+markers",name:"Memory Usage 
(GB)",x:["8","16","32","64","128","256"],y:[36.66,36.66,36.66,36.66,36.66,36.66],type:"scatter",xaxis:"x2",yaxis:"y2"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,
"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.
2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gr
idcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.45],title:{text:"Data Parallelism (DP)"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],title:{text:"Throughput (tokens/sec/GPU)"},showgrid:!0,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.55,1],title:{text:"Data Parallelism (DP)"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],title:{text:"Memory Usage (GB)"},showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"Throughput Scaling with Data Parallelism",x:.225,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"Memory Usage Scaling with Data Parallelism",x:.775,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{color:"#e889ab"},showarrow:!1,text:"-6.3%",x:1,xanchor:"center",xref:"x",xshift:30,y:38879.815,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-6.0%",x:2,xanchor:"center",xref:"x",xshift:30,y:36488.65,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-12.0%",x:3,xanchor:"center",xref:"x",xshift:30,y:33239.92,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-15.0%",x:4,xanchor:"center",xref:"x",xshift:30,y:28779.335,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-40.6%",x:5,xanchor:"center",xref:"x",xshift:30,y:21073.41,yanchor:"middle",yref:"y"}],legend:{x:.55,y:1},width:1e3,height:400,barmode:"stack"},{responsive:!0})</script> </div>
dist/fragments/memusage_activations.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=121faba8-d8ec-447e-9ab1-9a8fa34c1f63 class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("121faba8-d8ec-447e-9ab1-9a8fa34c1f63")&&Plotly.newPlot("121faba8-d8ec-447e-9ab1-9a8fa34c1f63",[{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer states",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625,52.42681884765625],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[9.25390625,28.5078125,97.015625,354.03125,1348.0625],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer 
states",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125,488.8917236328125],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[46.2578125,142.515625,485.03125,1770.0625,6740.125],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer states",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125,3041.8564453125],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[145.703125,448.90625,1527.8125,5575.625,21231.25],type:"bar",xaxis:"x3",yaxis:"y3"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444
444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",wid
th:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[
.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2888888888888889],showgrid:!0,gridwidth:1,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],title:{text:"GB 
memory"},showgrid:!0,gridwidth:1,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.35555555555555557,.6444444444444445],showgrid:!0,gridwidth:1,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],showgrid:!0,gridwidth:1,gridcolor:"LightGray"},xaxis3:{anchor:"y3",domain:[.7111111111111111,1],showgrid:!0,gridwidth:1,gridcolor:"LightGray"},yaxis3:{anchor:"x3",domain:[0,1],showgrid:!0,gridwidth:1,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"Meta-Llama-3.1-8B",x:.14444444444444446,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"Meta-Llama-3.1-70B",x:.5,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"Meta-Llama-3.1-405B",x:.8555555555555556,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],barmode:"stack",width:1e3,height:400,legend:{title:{}}},{responsive:!0})</script> </div>
dist/fragments/pp_bubblesize.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=3997052d-7390-4347-abf3-6c8b08e53c31 class=plotly-graph-div style="height:650px; width:900px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("3997052d-7390-4347-abf3-6c8b08e53c31")&&Plotly.newPlot("3997052d-7390-4347-abf3-6c8b08e53c31",[{marker:{color:"#4ea5b7"},orientation:"h",text:["0.03","0.05","0.05","0.11","0.11","0.11","0.22","0.22","0.22","0.22","0.44","0.44","0.44","0.44","0.88","0.88","0.88","0.88","1.75","1.75","1.75","3.50","3.50","7.00"],textposition:"outside",x:[.02734375,.0546875,.0546875,.109375,.109375,.109375,.21875,.21875,.21875,.21875,.4375,.4375,.4375,.4375,.875,.875,.875,.875,1.75,1.75,1.75,3.5,3.5,7],y:["m=32, v=8","m=16, v=8","m=32, v=4","m=32, v=2","m=16, v=4","m=8, v=8","m=32, v=1","m=16, v=2","m=8, v=4","m=4, v=8","m=4, v=4","m=8, v=2","m=2, v=8","m=16, v=1","m=8, v=1","m=2, v=4","m=1, v=8","m=4, v=2","m=4, v=1","m=2, v=2","m=1, v=4","m=2, v=1","m=1, v=2","m=1, v=1"],type:"bar"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b
"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],sc
atter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,
"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},yaxis:{tickmode:"array",tickvals:["m=32, v=8","m=16, v=8","m=32, v=4","m=32, v=2","m=16, v=4","m=8, v=8","m=32, v=1","m=16, v=2","m=8, v=4","m=4, v=8","m=4, v=4","m=8, v=2","m=2, v=8","m=16, v=1","m=8, v=1","m=2, v=4","m=1, v=8","m=4, v=2","m=4, v=1","m=2, v=2","m=1, v=4","m=2, v=1","m=1, v=2","m=1, v=1"],ticktext:["m=32, v=8","m=16, v=8","m=32, v=4","m=32, v=2","m=16, v=4","m=8, v=8","m=32, v=1","m=16, v=2","m=8, v=4","m=4, v=8","m=4, v=4","m=8, v=2","m=2, v=8","m=16, v=1","m=8, v=1","m=2, v=4","m=1, v=8","m=4, v=2","m=4, v=1","m=2, v=2","m=1, v=4","m=2, v=1","m=1, v=2","m=1, v=1"],title:{text:"PP configuration"}},margin:{l:150,r:100,t:100,b:100},title:{text:"Bubble size for PP=8"},xaxis:{title:{text:"Bubble size"}},width:900,height:650},{responsive:!0})</script> </div>
dist/fragments/pp_comm_bandwidth.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=91842954-4418-43b7-bc03-81a893ff71fe class=plotly-graph-div style="height:410px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("91842954-4418-43b7-bc03-81a893ff71fe")&&Plotly.newPlot("91842954-4418-43b7-bc03-81a893ff71fe",[{fill:"toself",fillcolor:"rgba(78,165,183,0.2)",hoverinfo:"skip",line:{color:"rgba(255,255,255,0)"},showlegend:!1,x:[0,1,2,3,4,5,6,6,5,4,3,2,1,0],y:[461.538,436.13,219.09099999999998,192.73399999999992,188.42249999999999,194.227,177.35999999999999,12.493500000000001,24.965,31.34,41.587,44.509,302.33500000000004,422.288],type:"scatter"},{line:{color:"#4ea5b7",width:3},marker:{size:10,symbol:"circle"},mode:"lines+markers+text",name:"AllReduce",text:["436.0","361.7","160.1","99.6","84.7","64.9","32.9"],textposition:"bottom center",x:[0,1,2,3,4,5,6],y:[435.9668115942029,361.74920529801324,160.13950738916256,99.56427561837455,84.74052884615384,64.92543661971831,32.937847222222224],type:"scatter"},{fill:"toself",fillcolor:"rgba(232,137,171,0.2)",hoverinfo:"skip",line:{color:"rgba(255,255,255,0)"},showlegend:!1,x:[0,1,2,3,4,5,6,6,5,4,3,2,1,0],y:[264.93,226.26999999999998,229.40999999999997,178.47899999999998,126.6575,77.026,44.4165,6.1535,12.314,24.525000000000002,47.147,40.757000000000005,147.97500000000002,239.55200000000002],type:"scatter"},{line:{color:"#e889ab",width:3},marker:{size:10,symbol:"square"},mode:"lines+markers",name:"AllGather",x:[0,1,2,3,4,5,6],y:[249.84884057971013,184.61324503311258,118.96753694581281,68.99752650176679,54.972283653846155,27.969183098591547,11.038298611111111],type:"scatter"},{fill:"toself",fillcolor:"rgba(206,192,250,0.2)",hoverinfo:"skip",line:{color:"rgba(255,255,255,0)"},showlegend:!1,x:[0,1,2,3,4,5,6,6,5,4,3,2,1,0],y:[264.64599999999996,226.37,215.492,177.54299999999998,126.4825,77.289,45.1295,6.1005,12.39,24.544999999999998,46.802,41.176,146.1,240.804],type:"scatter"},{line:{color:"#cec0fa",width:3},marker:{size:10,symbol:"triangle-up"},mode:"lines+mar
kers",name:"ReduceScatter",x:[0,1,2,3,4,5,6],y:[249.72898550724636,181.5535761589404,115.7576354679803,68.55106007067138,54.524230769230776,27.944281690140844,11.069652777777778],type:"scatter"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6
666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{li
ne:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,tic
ks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{title:{text:"Number of Nodes"},tickmode:"array",tickvals:[0,1,2,3,4,5,6],ticktext:["1","2","4","8","16","32","64"]},yaxis:{title:{text:"Bandwidth (GB/s)"},range:[0,480],gridcolor:"rgba(0,0,0,0.1)"},legend:{x:.85,y:1,bgcolor:"rgba(255,255,255,0.5)"},margin:{l:80,r:80,t:80,b:80},title:{text:"Communication Bandwidth by Number of Nodes (size=256MB)"},width:1e3,height:410},{responsive:!0})</script> </div>
dist/fragments/pp_memoryusage.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=8db330ae-52ab-4193-8ee5-dfa289fe4609 class=plotly-graph-div style="height:410px; width:800px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("8db330ae-52ab-4193-8ee5-dfa289fe4609")&&Plotly.newPlot("8db330ae-52ab-4193-8ee5-dfa289fe4609",[{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!0,x:["1024","4096","16384"],y:[14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!0,x:["1024","4096","16384"],y:[14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!0,x:["1024","4096","16384"],y:[59.828125,59.828125,59.828125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!0,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[2.603515625,2.603515625,2.603515625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[2.603515625,2.603515625,2.603515625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer 
States",showlegend:!1,x:["1024","4096","16384"],y:[10.4140625,10.4140625,10.4140625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x2",yaxis:"y2"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333
333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},heade
r:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:
{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.45],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray",title:{text:"Memory Usage (GB)"}},xaxis2:{anchor:"y2",domain:[.55,1],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"No Parallelism",x:.225,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"PP=8",x:.775,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"}],title:{text:"Memory Usage for 8B Model"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:800,height:410},{responsive:!0})</script> </div>
dist/fragments/tp_memoryusage.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=eed45cdb-8f9e-4c54-b31f-85062d3362e8 class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("eed45cdb-8f9e-4c54-b31f-85062d3362e8")&&Plotly.newPlot("eed45cdb-8f9e-4c54-b31f-85062d3362e8",[{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!0,x:["1024","4096","16384"],y:[131.5,131.5,131.5],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!0,x:["1024","4096","16384"],y:[131.5,131.5,131.5],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!0,x:["1024","4096","16384"],y:[526,526,526],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!0,x:["1024","4096","16384"],y:[21.25,85,340],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[16.4375,16.4375,16.4375],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[16.4375,16.4375,16.4375],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384"],y:[65.75,65.75,65.75],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[8.125,32.5,130],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model 
Parameters",showlegend:!1,x:["1024","4096","16384"],y:[8.21875,8.21875,8.21875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[8.21875,8.21875,8.21875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384"],y:[32.875,32.875,32.875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[7.1875,28.75,115],type:"bar",xaxis:"x3",yaxis:"y3"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.55555555555555
56,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar
:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",t
icks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2888888888888889],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],range:[0,150],dtick:20,title:{text:"Memory Usage (GB)"},showgrid:!0,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.35555555555555557,.6444444444444445],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},xaxis3:{anchor:"y3",domain:[.7111111111111111,1],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis3:{anchor:"x3",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"No Parallelism 
(TP-1)",x:.14444444444444446,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=8",x:.5,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=16",x:.8555555555555556,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x3 domain",y0:80,y1:80,yref:"y3"}],title:{text:"Memory Usage for 70B Model"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:1e3,height:400},{responsive:!0})</script> </div>
dist/fragments/tp_scaling.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=8de1ceac-2d60-4368-8691-415db492ed15 class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("8de1ceac-2d60-4368-8691-415db492ed15")&&Plotly.newPlot("8de1ceac-2d60-4368-8691-415db492ed15",[{marker:{color:"#4ea5b7"},name:"Tokens/sec/GPU",width:.7,x:["2","4","8","16","32"],y:[13923.18,12420.76,10903.32,6245.6,2146.44],type:"bar",xaxis:"x",yaxis:"y"},{base:[12420.76],marker:{color:"#e889ab"},name:"Performance Drop",showlegend:!0,width:.0875,x:["4"],y:[1502.42],type:"bar",xaxis:"x",yaxis:"y"},{base:[10903.32],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["8"],y:[1517.4400000000005],type:"bar",xaxis:"x",yaxis:"y"},{base:[6245.6],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["16"],y:[4657.719999999999],type:"bar",xaxis:"x",yaxis:"y"},{base:[2146.44],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["32"],y:[4099.16],type:"bar",xaxis:"x",yaxis:"y"},{marker:{color:"#cec0fa"},name:"Max Batch 
Size",text:["3","8","12","16","20"],textposition:"inside",width:.7,x:["2","4","8","16","32"],y:[3,8,12,16,20],type:"bar",xaxis:"x2",yaxis:"y2"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9
f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"ov
erlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{bac
kgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.45],title:{text:"Tensor Parallelism (TP)"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],title:{text:"Tokens/sec/GPU"},showgrid:!0,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.55,1],title:{text:"Tensor Parallelism (TP)"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],title:{text:"Maximum Batch Size"},showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"Throughput Scaling with TP (3B Model)",x:.225,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"Maximum Batch Size per TP Value",x:.775,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{color:"#e889ab"},showarrow:!1,text:"-10.8%",x:1,xanchor:"center",xref:"x",xshift:30,y:13171.970000000001,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-12.2%",x:2,xanchor:"center",xref:"x",xshift:30,y:11662.04,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-42.7%",x:3,xanchor:"center",xref:"x",xshift:30,y:8574.46,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-65.6%",x:4,xanchor:"center",xref:"x",xshift:30,y:4196.02,yanchor:"middle",yref:"y"}],legend:{x:.55,y:1},width:1e3,height:400,barmode:"stack"},{responsive:!0})</script> </div>
dist/fragments/tp_sp_memoryusage.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=16242941-9fab-40ed-996e-ddd93a5a627c class=plotly-graph-div style="height:410px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("16242941-9fab-40ed-996e-ddd93a5a627c")&&Plotly.newPlot("16242941-9fab-40ed-996e-ddd93a5a627c",[{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!0,x:["1024","4096","16384"],y:[131.5,131.5,131.5],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[16.4375,16.4375,16.4375],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[8.21875,8.21875,8.21875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!0,x:["1024","4096","16384"],y:[131.5,131.5,131.5],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[16.4375,16.4375,16.4375],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[8.21875,8.21875,8.21875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!0,x:["1024","4096","16384"],y:[526,526,526],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384"],y:[65.75,65.75,65.75],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer 
States",showlegend:!1,x:["1024","4096","16384"],y:[32.875,32.875,32.875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!0,x:["1024","4096","16384"],y:[21.25,85,340],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[2.65625,10.625,42.5],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[1.328125,5.3125,21.25],type:"bar",xaxis:"x3",yaxis:"y3"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.666666
6666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks
:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standof
f:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2888888888888889],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray",title:{text:"Memory Usage (GB)"}},xaxis2:{anchor:"y2",domain:[.35555555555555557,.6444444444444445],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},xaxis3:{anchor:"y3",domain:[.7111111111111111,1],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis3:{anchor:"x3",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"No Parallelism",x:.14444444444444446,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=8 (with SP)",x:.5,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=16 (with 
SP)",x:.8555555555555556,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x3 domain",y0:80,y1:80,yref:"y3"}],title:{text:"Memory Usage for 70B Model"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:1e3,height:410},{responsive:!0})</script> </div>
dist/fragments/tp_sp_scaling.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=e996eac5-dceb-42af-9cc6-d21f247621f5 class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("e996eac5-dceb-42af-9cc6-d21f247621f5")&&Plotly.newPlot("e996eac5-dceb-42af-9cc6-d21f247621f5",[{marker:{color:"#4ea5b7"},name:"Tokens/sec/GPU",width:.7,x:["2","4","8","16","32"],y:[14167.25,13460.16,10888.53,6159.3,3609.73],type:"bar",xaxis:"x",yaxis:"y"},{base:[13460.16],marker:{color:"#e889ab"},name:"Performance Drop",showlegend:!0,width:.0875,x:["4"],y:[707.0900000000001],type:"bar",xaxis:"x",yaxis:"y"},{base:[10888.53],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["8"],y:[2571.629999999999],type:"bar",xaxis:"x",yaxis:"y"},{base:[6159.3],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["16"],y:[4729.2300000000005],type:"bar",xaxis:"x",yaxis:"y"},{base:[3609.73],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["32"],y:[2549.57],type:"bar",xaxis:"x",yaxis:"y"},{marker:{color:"#cec0fa"},name:"Max Batch 
Size",text:["4","10","20","40","100"],textposition:"inside",width:.7,x:["2","4","8","16","32"],y:[4,10,20,40,100],type:"bar",xaxis:"x2",yaxis:"y2"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"
#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode
:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:
{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.45],title:{text:"Tensor Parallelism (TP)"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],title:{text:"Tokens/sec/GPU"},showgrid:!0,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.55,1],title:{text:"Tensor Parallelism (TP)"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],title:{text:"Maximum Batch Size"},showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"Throughput Scaling with TP/SP (3B Model)",x:.225,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"Maximum Batch Size per TP Value",x:.775,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{color:"#e889ab"},showarrow:!1,text:"-5.0%",x:1,xanchor:"center",xref:"x",xshift:30,y:13813.705,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-19.1%",x:2,xanchor:"center",xref:"x",xshift:30,y:12174.345000000001,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-43.4%",x:3,xanchor:"center",xref:"x",xshift:30,y:8523.915,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-41.4%",x:4,xanchor:"center",xref:"x",xshift:30,y:4884.515,yanchor:"middle",yref:"y"}],legend:{x:.55,y:1},width:1e3,height:400,barmode:"stack"},{responsive:!0})</script> </div>
dist/fragments/zero3_memoryusage.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id=be6d1423-335a-45b6-a848-dc4d0d70706f class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("be6d1423-335a-45b6-a848-dc4d0d70706f")&&Plotly.newPlot("be6d1423-335a-45b6-a848-dc4d0d70706f",[{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!0,x:["1024","4096","16384"],y:[15,15,15],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!0,x:["1024","4096","16384"],y:[15,15,15],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!0,x:["1024","4096","16384"],y:[60,60,60],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!0,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[15,15,15],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[15,15,15],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384"],y:[7.5,7.5,7.5],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[15,15,15],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[1.875,1.875,1.875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer 
States",showlegend:!1,x:["1024","4096","16384"],y:[7.5,7.5,7.5],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[1.875,1.875,1.875],type:"bar",xaxis:"x4",yaxis:"y4"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[1.875,1.875,1.875],type:"bar",xaxis:"x4",yaxis:"y4"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384"],y:[7.5,7.5,7.5],type:"bar",xaxis:"x4",yaxis:"y4"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x4",yaxis:"y4"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7
777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{col
orbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed79
53"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2125],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],title:{text:"Memory Usage (GB)"},dtick:20,showgrid:!0,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.2625,.475],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,showgrid:!0,gridcolor:"LightGray"},xaxis3:{anchor:"y3",domain:[.525,.7375],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis3:{anchor:"x3",domain:[0,1],matches:"y",showticklabels:!1,showgrid:!0,gridcolor:"LightGray"},xaxis4:{anchor:"y4",domain:[.7875,1],title:{text:"Sequence 
Length"},showgrid:!0,gridcolor:"LightGray"},yaxis4:{anchor:"x4",domain:[0,1],matches:"y",showticklabels:!1,showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"DP=8",x:.10625,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"DP=8 Zero-1",x:.36875,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"DP=8 Zero-2",x:.6312500000000001,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"DP=8 Zero-3",x:.89375,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x3 domain",y0:80,y1:80,yref:"y3"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x4 domain",y0:80,y1:80,yref:"y4"}],title:{text:"Memory Usage for 8B Model"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:1e3,height:400},{responsive:!0})</script> </div>
dist/index.html CHANGED
@@ -208,7 +208,6 @@
208
 
209
  <!-- <iframe id="plotFrame" src="assets/data/benchmarks/benchmarks_interactive.html" scrolling="no" frameborder="0" height="840" width="720"></iframe> -->
210
  <div id="fragment-benchmarks_interactive"></div>
211
- <!-- -->
212
 
213
  <p>As you can see, there’s a lot of ground to be covered. Before getting into the trenches of distributed training, let’s take a quick high-level look at the challenges we'll cover in the book.</p>
214
 
@@ -339,7 +338,8 @@
339
  <div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
340
  <script src="../assets/images/first_steps_memory_profile.js"></script>
341
  -->
342
-
 
343
 
344
  <p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
345
 
@@ -460,14 +460,14 @@
460
 
461
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
462
 
463
- <iframe class="l-body-outset" id="plotFrame3" src="assets/data/benchmarks/memusage_activations.html" width="90%" scrolling="no" frameborder="0"></iframe>
464
- <script>
465
  window.addEventListener('load', function() {
466
  const frame = document.getElementById('plotFrame3');
467
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
468
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
469
  });
470
- </script>
471
 
472
  <p>This graph tells a striking story: for short sequences (or similarly for small batch sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
473
 
@@ -495,7 +495,7 @@
495
 
496
  <p>Let’s see how drastically recomputation strategies can in practice reduce the memory footprint and how selective recomputation strikes a nice balance between memory saving and recomputation cost:</p>
497
 
498
-
499
 
500
  <p>Another trend that's clearly visible here is how the activations for long sequences play a bigger role for smaller models, so the effect of recomputation becomes even more noticeable.</p>
501
 
@@ -734,14 +734,14 @@
734
  <p>Let's see this happening in practice with some benchmarks:</p>
735
 
736
  <!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
737
- <iframe class="l-body-outset" id="plotFrame4" src="assets/data/benchmarks/dp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
738
- <script>
739
  window.addEventListener('load', function() {
740
  const frame = document.getElementById('plotFrame4');
741
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
742
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
743
  });
744
- </script>
745
 
746
  <p>We see that above some limit, our throughput starts to drop quite significantly while the memory usage per GPU stays constant and is not affected by adding more DP ranks.</p>
747
 
@@ -750,14 +750,15 @@
750
  <p>The keen reader has probably already noted, however, that this assumes that we can fit at least one input sample forward pass (<em>mbs=1</em>) into our GPU memory. This is not always the case! As we can see, larger models don’t fit into a single GPU, even with activation recomputation activated: </p>
751
  <aside>Tip: you can quickly eyeball the minimal memory required for your model’s parameters by multiplying by 2 e.g. 70B → 140GB (=133GiB)</aside>
752
 
753
- <iframe class="l-body-outset" id="plotFrame5" src="assets/data/benchmarks/dp_ourjourney_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
754
- <script>
 
755
  window.addEventListener('load', function() {
756
  const frame = document.getElementById('plotFrame5');
757
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
758
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
759
  });
760
- </script>
761
  <!-- <p><img alt="dp_ourjourney_memoryusage.svg" src="/assets/images/dp_ourjourney_memoryusage.svg" /></p> -->
762
 
763
 
@@ -907,14 +908,15 @@
907
 
908
  <p>However, there is a limit here, DP only works if a layer of the model fits in a single GPU and ZeRO can only partition the parameters, gradients, and optimizer states, but not the activation memory! We recall from <a target="_self" href="#memory_usage_in_transformers">the activation memory discussion</a> that this part of the memory scales with sequence length and batch size. Naturally we could just limit those, but in practice we don’t want to be limited by hardware to train only with a short sequence length. </p>
909
 
910
- <iframe class="l-body-outset" id="plotFrame6" src="assets/data/benchmarks/zero3_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
911
- <script>
 
912
  window.addEventListener('load', function() {
913
  const frame = document.getElementById('plotFrame6');
914
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
915
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
916
  });
917
- </script>
918
  <!-- <p><img alt="zero3_memoryusage.svg" src="/assets/images/zero3_memoryusage.svg" /></p> -->
919
 
920
  <p>To overcome these issues, it's time to explore a new, orthogonal axis of parallelism - Tensor Parallelism (TP). Unlike ZeRO3 which relies on heavy parameter communication, TP proposes to shard parameters, gradients, optimizer states AND activations across devices without requiring any communication of model parameters between GPUs.</p>
@@ -1013,14 +1015,15 @@
1013
 
1014
  <p> Let's take a better look at the trade-off as we scale the TP degree:</p>
1015
 
1016
- <iframe class="l-body-outset" id="plotFrame13" src="assets/data/benchmarks/tp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
1017
- <script>
 
1018
  window.addEventListener('load', function() {
1019
  const frame = document.getElementById('plotFrame13');
1020
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1021
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1022
  });
1023
- </script>
1024
  <!--
1025
  <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p> -->
1026
 
@@ -1030,14 +1033,15 @@
1030
 
1031
  <p>This being said, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
1032
 
1033
- <iframe class="l-body-outset" id="plotFrame7" src="assets/data/benchmarks/tp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
1034
- <script>
 
1035
  window.addEventListener('load', function() {
1036
  const frame = document.getElementById('plotFrame7');
1037
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1038
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1039
- });
1040
- </script>
1041
  <!-- <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p> -->
1042
 
1043
  <p>Increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU to the point where we can start fitting a large model on a single node of 8 GPUs. </p>
@@ -1194,14 +1198,15 @@
1194
 
1195
  <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
1196
 
1197
- <iframe class="l-body-outset" id="plotFrame8" src="assets/data/benchmarks/tp_sp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
1198
- <script>
 
1199
  window.addEventListener('load', function() {
1200
  const frame = document.getElementById('plotFrame8');
1201
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1202
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1203
- });
1204
- </script>
1205
  <!-- <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p> -->
1206
 
1207
  <p>As we can see, we've again strongly reduced the maximum memory usage per GPU, allowing us to fit sequence lengths of 16k tokens with TP/SP=16, an improvement over the vanilla TP case! (TP=16 is still a bit large as we've seen in the previous section, but we'll see how we can improve this in the next section).</p>
@@ -1216,14 +1221,15 @@
1216
 
1217
  <p>We can benchmark how this communication overhead becomes increasingly problematic as we scale up tensor parallelism. Let’s measure the throughput and memory utilization as we scale TP with SP for a 3B model with 4096 seqlen:</p>
1218
 
1219
- <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
1220
- <script>
 
1221
  window.addEventListener('load', function() {
1222
  const frame = document.getElementById('plotFrame2');
1223
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1224
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1225
- });
1226
- </script>
1227
 
1228
  <!-- <p><img alt="tp_sp_scaling.svg" src="/assets/images/tp_sp_scaling.svg" /></p> -->
1229
  <p>Here again, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees enable processing of significantly larger batch sizes by reducing the activation memory, they also reduce per-GPU throughput, in particular above a threshold corresponding to the number of GPUs per node.</p>
@@ -1255,14 +1261,15 @@
1255
 
1256
  <p>Moreover, even if we use full recomputation of the activations (which comes at a heavy compute overhead of ~30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length. Let's take a look and see how Context Parallelism can help us:</p>
1257
 
1258
- <iframe class="l-body-outset" id="plotFrame9" src="assets/data/benchmarks/cp_8Bmemoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
1259
- <script>
 
1260
  window.addEventListener('load', function() {
1261
  const frame = document.getElementById('plotFrame9');
1262
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1263
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1264
  });
1265
- </script>
1266
 
1267
  <!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
1268
 
@@ -1356,14 +1363,15 @@
1356
 
1357
  <p>In the <a target="_self" href="#tensor-parallelism">Tensor Parallelism</a> section we saw that trying to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) hits a lower-bandwidth network called the “inter-node connection”, which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we benchmark it on our cluster across several nodes (each node has 8 GPUs):</p>
1358
 
1359
- <iframe class="l-body-outset" id="plotFrame11" src="assets/data/benchmarks/pp_comm_bandwidth.html" width="90%" scrolling="no" frameborder="0"></iframe>
1360
- <script>
 
1361
  window.addEventListener('load', function() {
1362
  const frame = document.getElementById('plotFrame11');
1363
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1364
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1365
  });
1366
- </script>
1367
 
1368
  <!-- <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p> -->
1369
  <p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
@@ -1374,14 +1382,15 @@
1374
 
1375
  <aside>This technique may remind you of our discussion on <a target="_self" href="#zero-redundancy-optimizer">ZeRO-3</a> where we split the model parameters across GPUs. We compare both techniques in detail later in the <a target="_self" href="#5d_parallelism_in_a_nutshell">5D parallelism in a nutshell</a> section.</aside>
1376
 
1377
- <iframe class="l-body" id="plotFrame12" src="assets/data/benchmarks/pp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
1378
- <script>
 
1379
  window.addEventListener('load', function() {
1380
  const frame = document.getElementById('plotFrame12');
1381
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1382
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1383
  });
1384
- </script>
1385
  <!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
1386
 
1387
  <p>Looking at the figure above, we notice something interesting: while the model parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers will be sent to the next GPU to continue the forward pass.</p>
@@ -1507,14 +1516,15 @@
1507
 
1508
  <p>So we can now decrease the bubble by adding microbatches and interleaved stages, but note that quantitatively, the amount of communication also increases by <d-math>v</d-math> so it’s a trade-off. In the following plot you can see several configurations for a PP setup with <d-math>p=8</d-math>, where the special case of <d-math>m=1, v=1</d-math> corresponds to naive pipeline parallelism and the configurations with <d-math>v=1</d-math> are AFAB or 1F1B setups and <d-math>v \neq 1</d-math> are interleaved configurations.</p>
1509
 
1510
- <iframe class="l-body" id="plotFrame23" src="assets/data/benchmarks/pp_bubblesize.html" width="90%" scrolling="no" frameborder="0"></iframe>
1511
- <script>
 
1512
  window.addEventListener('load', function() {
1513
  const frame = document.getElementById('plotFrame23');
1514
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1515
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1516
  });
1517
- </script>
1518
  <!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
1519
 
1520
 
@@ -2312,6 +2322,8 @@
2312
  <p>We know that instability increases as learning rates rise for a fixed model size<d-cite bibtex-key="wortsman2023smallscaleproxieslargescaletransformer"></d-cite>, making FP8 pretraining particularly tricky.</p>
2313
 
2314
  <iframe class="l-body-outset" id="plotFP8Loss" src="/assets/data/fp8/fp8_training_loss_curves.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
 
 
2315
 
2316
  <p>The first successful very-large-scale training with FP8 mixed precision was publicly reported for DeepSeek-V3. The authors carefully analyzed each operation of the forward pass (Fprop) as well as the activation (Dgrad) and weight (Wgrad) backward passes. Similar to BF16 mixed precision training, some aggregation and master weights are kept in higher precision while the operations themselves are performed in FP8. </p>
2317
 
@@ -2730,8 +2742,7 @@
2730
  <div>
2731
  <a href="https://gordicaleksa.medium.com/eli5-flash-attention-5c44017022ad"><strong>Aleksa's ELI5 Flash Attention</strong></a>
2732
  <p>Easy explanation of Flash Attention</p>
2733
- </div>
2734
-
2735
  <div>
2736
  <a href="https://github.com/tunib-ai/large-scale-lm-tutorials"><strong>TunibAI's 3D parallelism tutorial</strong></a>
2737
  <p>Large-scale language modeling tutorials with PyTorch.</p>
@@ -3242,4 +3253,4 @@
3242
 
3243
  </body>
3244
 
3245
- </html>
 
208
 
209
  <!-- <iframe id="plotFrame" src="assets/data/benchmarks/benchmarks_interactive.html" scrolling="no" frameborder="0" height="840" width="720"></iframe> -->
210
  <div id="fragment-benchmarks_interactive"></div>
 
211
 
212
  <p>As you can see, there’s a lot of ground to be covered. Before getting into the trenches of distributed training, let’s take a quick high-level look at the challenges we'll cover in the book.</p>
213
 
 
338
  <div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
339
  <script src="../assets/images/first_steps_memory_profile.js"></script>
340
  -->
341
+ <!-- -->
342
+ <div id="fragment-memory-profile"></div>
343
 
344
  <p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
345
 
 
460
 
461
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
462
 
463
+ <div class="l-body-outset" id="fragment-memusage_activations"></div>
464
+ <!-- <script>
465
  window.addEventListener('load', function() {
466
  const frame = document.getElementById('plotFrame3');
467
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
468
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
469
  });
470
+ </script> -->
471
 
472
  <p>This graph tells a striking story: for short sequences (or similarly for small batch sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
473
 
 
495
 
496
  <p>Let’s see how drastically recomputation strategies can in practice reduce the memory footprint and how selective recomputation strikes a nice balance between memory saving and recomputation cost:</p>
497
 
498
+ <div id="fragment-memory-recomputation"></div>
499
 
500
  <p>Another trend that's clearly visible here is how the activations for long sequences play a bigger role for smaller models, so the effect of recomputation becomes even more noticeable.</p>
501
 
 
734
  <p>Let's see this happening in practice with some benchmarks:</p>
735
 
736
  <!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
737
+ <div class="l-body-outset" id="fragment-dp_scaling"></div>
738
+ <!-- <script>
739
  window.addEventListener('load', function() {
740
  const frame = document.getElementById('plotFrame4');
741
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
742
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
743
  });
744
+ </script> -->
745
 
746
  <p>We see that above some limit, our throughput starts to drop quite significantly while the memory usage per GPU stays constant and is not affected by adding more DP ranks.</p>
747
 
 
750
  <p>The keen reader has probably already noted, however, that this assumes that we can fit at least one input sample forward pass (<em>mbs=1</em>) into our GPU memory. This is not always the case! As we can see, larger models don’t fit into a single GPU, even with activation recomputation activated: </p>
751
  <aside>Tip: you can quickly eyeball the minimal memory required for your model’s parameters by multiplying the parameter count by 2 bytes, e.g. 70B → 140GB (≈130GiB)</aside>
752
 
753
+ <!-- <iframe class="l-body-outset" id="plotFrame5" src="assets/data/benchmarks/dp_ourjourney_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
754
+ <div class="l-body-outset" id="fragment-dp_ourjourney_memoryusage"></div>
755
+ <!-- <script>
756
  window.addEventListener('load', function() {
757
  const frame = document.getElementById('plotFrame5');
758
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
759
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
760
  });
761
+ </script> -->
762
  <!-- <p><img alt="dp_ourjourney_memoryusage.svg" src="/assets/images/dp_ourjourney_memoryusage.svg" /></p> -->
763
 
764
 
 
908
 
909
  <p>However, there is a limit here: DP only works if a layer of the model fits in a single GPU, and ZeRO can only partition the parameters, gradients, and optimizer states, but not the activation memory! We recall from <a target="_self" href="#memory_usage_in_transformers">the activation memory discussion</a> that this part of the memory scales with sequence length and batch size. Naturally we could just limit those, but in practice we don’t want to be limited by hardware to train only with a short sequence length. </p>
910
 
911
+ <!-- <iframe class="l-body-outset" id="plotFrame6" src="assets/data/benchmarks/zero3_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
912
+ <div class="l-body-outset" id="fragment-zero3_memoryusage"></div>
913
+ <!-- <script>
914
  window.addEventListener('load', function() {
915
  const frame = document.getElementById('plotFrame6');
916
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
917
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
918
  });
919
+ </script> -->
920
  <!-- <p><img alt="zero3_memoryusage.svg" src="/assets/images/zero3_memoryusage.svg" /></p> -->
921
 
922
  <p>To overcome these issues, it's time to explore a new, orthogonal axis of parallelism - Tensor Parallelism (TP). Unlike ZeRO-3, which relies on heavy parameter communication, TP proposes to shard parameters, gradients, optimizer states AND activations across devices without requiring any communication of model parameters between GPUs.</p>
 
1015
 
1016
  <p> Let's take a better look at the trade-off as we scale the TP degree:</p>
1017
 
1018
+ <!-- <iframe class="l-body-outset" id="plotFrame13" src="assets/data/benchmarks/tp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1019
+ <div class="l-body-outset" id="fragment-tp_scaling"></div>
1020
+ <!-- <script>
1021
  window.addEventListener('load', function() {
1022
  const frame = document.getElementById('plotFrame13');
1023
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1024
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1025
  });
1026
+ </script> -->
1027
  <!--
1028
  <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p> -->
1029
 
 
1033
 
1034
  <p>This being said, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
1035
 
1036
+ <!-- <iframe class="l-body-outset" id="plotFrame7" src="assets/data/benchmarks/tp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1037
+ <div class="l-body-outset" id="fragment-tp_memoryusage"></div>
1038
+ <!-- <script>
1039
  window.addEventListener('load', function() {
1040
  const frame = document.getElementById('plotFrame7');
1041
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1042
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1043
+ }); -->
1044
+ <!-- </script> -->
1045
  <!-- <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p> -->
1046
 
1047
  <p>Increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU to the point where we can start fitting a large model on a single node of 8 GPUs. </p>
 
1198
 
1199
  <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
1200
 
1201
+ <!-- <iframe class="l-body-outset" id="plotFrame8" src="assets/data/benchmarks/tp_sp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1202
+ <div class="l-body-outset" id="fragment-tp_sp_memoryusage"></div>
1203
+ <!-- <script>
1204
  window.addEventListener('load', function() {
1205
  const frame = document.getElementById('plotFrame8');
1206
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1207
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1208
+ }); -->
1209
+ <!-- </script> -->
1210
  <!-- <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p> -->
1211
 
1212
  <p>As we can see, we've again strongly reduced the maximum memory usage per GPU, allowing us to fit sequence lengths of 16k tokens with TP/SP=16, an improvement over the vanilla TP case! (TP=16 is still a bit large as we've seen in the previous section, but we'll see how we can improve this in the next section).</p>
 
1221
 
1222
  <p>We can benchmark how this communication overhead becomes increasingly problematic as we scale up tensor parallelism. Let’s measure the throughput and memory utilization as we scale TP with SP for a 3B model with 4096 seqlen:</p>
1223
 
1224
+ <!-- <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1225
+ <div class="l-body-outset" id="fragment-tp_sp_scaling"></div>
1226
+ <!-- <script>
1227
  window.addEventListener('load', function() {
1228
  const frame = document.getElementById('plotFrame2');
1229
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1230
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1231
+ }); -->
1232
+ <!-- </script> -->
1233
 
1234
  <!-- <p><img alt="tp_sp_scaling.svg" src="/assets/images/tp_sp_scaling.svg" /></p> -->
1235
  <p>Here again, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees enable processing of significantly larger batch sizes by reducing the activation memory, they also reduce per-GPU throughput, in particular above a threshold corresponding to the number of GPUs per node.</p>
 
1261
 
1262
  <p>Moreover, even if we use full recomputation of the activations (which comes at a heavy compute overhead of ~30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length. Let's take a look and see how Context Parallelism can help us:</p>
1263
 
1264
+ <!-- <iframe class="l-body-outset" id="plotFrame9" src="assets/data/benchmarks/cp_8Bmemoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1265
+ <div class="l-body-outset" id="fragment-cp_8Bmemoryusage"></div>
1266
+ <!-- <script>
1267
  window.addEventListener('load', function() {
1268
  const frame = document.getElementById('plotFrame9');
1269
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1270
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1271
  });
1272
+ </script> -->
1273
 
1274
  <!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
1275
 
 
1363
 
1364
  <p>In the <a target="_self" href="#tensor-parallelism">Tensor Parallelism</a> section we saw that trying to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) hits a lower-bandwidth network called the “inter-node connection”, which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we benchmark it on our cluster across several nodes (each node has 8 GPUs):</p>
1365
 
1366
+ <!-- <iframe class="l-body-outset" id="plotFrame11" src="assets/data/benchmarks/pp_comm_bandwidth.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1367
+ <div class="l-body-outset" id="fragment-pp_comm_bandwidth"></div>
1368
+ <!-- <script>
1369
  window.addEventListener('load', function() {
1370
  const frame = document.getElementById('plotFrame11');
1371
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1372
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1373
  });
1374
+ </script> -->
1375
 
1376
  <!-- <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p> -->
1377
  <p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
 
1382
 
1383
  <aside>This technique may remind you of our discussion on <a target="_self" href="#zero-redundancy-optimizer">ZeRO-3</a> where we split the model parameters across GPUs. We compare both techniques in detail later in the <a target="_self" href="#5d_parallelism_in_a_nutshell">5D parallelism in a nutshell</a> section.</aside>
1384
 
1385
+ <!-- <iframe class="l-body" id="plotFrame12" src="assets/data/benchmarks/pp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1386
+ <div class="l-body" id="fragment-pp_memoryusage"></div>
1387
+ <!-- <script>
1388
  window.addEventListener('load', function() {
1389
  const frame = document.getElementById('plotFrame12');
1390
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1391
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1392
  });
1393
+ </script> -->
1394
  <!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
1395
 
1396
  <p>Looking at the figure above, we notice something interesting: while the model parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers will be sent to the next GPU to continue the forward pass.</p>
 
1516
 
1517
  <p>So we can now decrease the bubble by adding microbatches and interleaved stages, but note that quantitatively, the amount of communication also increases by <d-math>v</d-math> so it’s a trade-off. In the following plot you can see several configurations for a PP setup with <d-math>p=8</d-math>, where the special case of <d-math>m=1, v=1</d-math> corresponds to naive pipeline parallelism and the configurations with <d-math>v=1</d-math> are AFAB or 1F1B setups and <d-math>v \neq 1</d-math> are interleaved configurations.</p>
1518
 
1519
+ <!-- <iframe class="l-body" id="plotFrame23" src="assets/data/benchmarks/pp_bubblesize.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1520
+ <div class="l-body" id="fragment-pp_bubblesize"></div>
1521
+ <!-- <script>
1522
  window.addEventListener('load', function() {
1523
  const frame = document.getElementById('plotFrame23');
1524
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1525
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1526
  });
1527
+ </script> -->
1528
  <!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
1529
 
1530
 
 
2322
  <p>We know that instability increases as learning rates rise for a fixed model size<d-cite bibtex-key="wortsman2023smallscaleproxieslargescaletransformer"></d-cite>, making FP8 pretraining particularly tricky.</p>
2323
 
2324
  <iframe class="l-body-outset" id="plotFP8Loss" src="/assets/data/fp8/fp8_training_loss_curves.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
2325
+ <!-- Hynek uncomment this once it's added to -->
2326
+ <!-- <div class="l-body-outset" id="fragment-fp8_training_loss_curves"></div> -->
2327
 
2328
  <p>The first successful very-large-scale training with FP8 mixed precision was publicly reported for DeepSeek-V3. The authors carefully analyzed each operation of the forward pass (Fprop) as well as the activation (Dgrad) and weight (Wgrad) backward passes. Similar to BF16 mixed precision training, some aggregation and master weights are kept in higher precision while the operations themselves are performed in FP8. </p>
2329
 
 
2742
  <div>
2743
  <a href="https://gordicaleksa.medium.com/eli5-flash-attention-5c44017022ad"><strong>Aleksa's ELI5 Flash Attention</strong></a>
2744
  <p>Easy explanation of Flash Attention</p>
2745
+ </div>
 
2746
  <div>
2747
  <a href="https://github.com/tunib-ai/large-scale-lm-tutorials"><strong>TunibAI's 3D parallelism tutorial</strong></a>
2748
  <p>Large-scale language modeling tutorials with PyTorch.</p>
 
3253
 
3254
  </body>
3255
 
3256
+ </html>
src/index.html CHANGED
@@ -208,7 +208,6 @@
208
 
209
  <!-- <iframe id="plotFrame" src="assets/data/benchmarks/benchmarks_interactive.html" scrolling="no" frameborder="0" height="840" width="720"></iframe> -->
210
  <div id="fragment-benchmarks_interactive"></div>
211
- <!-- {{{benchmarks-interactive-html}}} -->
212
 
213
  <p>As you can see, there’s a lot of ground to be covered. Before getting into the trenches of distributed training, let’s take a quick high-level look at the challenges we'll cover in the book.</p>
214
 
@@ -339,8 +338,8 @@
339
  <div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
340
  <script src="../assets/images/first_steps_memory_profile.js"></script>
341
  -->
342
- {{!-- <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe> --}}
343
- {{{memory-profile-html}}}
344
 
345
  <p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
346
 
@@ -461,14 +460,14 @@
461
 
462
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
463
 
464
- <iframe class="l-body-outset" id="plotFrame3" src="assets/data/benchmarks/memusage_activations.html" width="90%" scrolling="no" frameborder="0"></iframe>
465
- <script>
466
  window.addEventListener('load', function() {
467
  const frame = document.getElementById('plotFrame3');
468
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
469
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
470
  });
471
- </script>
472
 
473
  <p>This graph tells a striking story: for short sequences (or similarly for small batch sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
474
 
@@ -496,7 +495,7 @@
496
 
497
  <p>Let’s see how drastically recomputation strategies can in practice reduce the memory footprint and how selective recomputation strikes a nice balance between memory saving and recomputation cost:</p>
498
 
499
- {{{memory-recomputation-html}}}
500
 
501
  <p>Another trend that's clearly visible here is how the activations for long sequences play a bigger role for smaller models, so the effect of recomputation becomes even more noticeable.</p>
502
 
@@ -735,14 +734,14 @@
735
  <p>Let's see this happening in practice with some benchmarks:</p>
736
 
737
  <!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
738
- <iframe class="l-body-outset" id="plotFrame4" src="assets/data/benchmarks/dp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
739
- <script>
740
  window.addEventListener('load', function() {
741
  const frame = document.getElementById('plotFrame4');
742
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
743
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
744
  });
745
- </script>
746
 
747
  <p>We see that above some limit, our throughput starts to drop quite significantly while the memory usage per GPU stays constant and is not affected by adding more DP ranks.</p>
748
 
@@ -751,14 +750,15 @@
751
  <p>The keen reader has probably already noted, however, that this assumes that we can fit at least one input sample forward pass (<em>mbs=1</em>) into our GPU memory. This is not always the case! As we can see, larger models don’t fit into a single GPU, even with activation recomputation activated: </p>
752
  <aside>Tip: you can quickly eyeball the minimal memory required for your model’s parameters by multiplying the parameter count by 2 bytes, e.g. 70B → 140GB (≈130GiB)</aside>
753
 
754
- <iframe class="l-body-outset" id="plotFrame5" src="assets/data/benchmarks/dp_ourjourney_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
755
- <script>
 
756
  window.addEventListener('load', function() {
757
  const frame = document.getElementById('plotFrame5');
758
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
759
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
760
  });
761
- </script>
762
  <!-- <p><img alt="dp_ourjourney_memoryusage.svg" src="/assets/images/dp_ourjourney_memoryusage.svg" /></p> -->
763
 
764
 
@@ -908,14 +908,15 @@
908
 
909
  <p>However, there is a limit here: DP only works if a layer of the model fits in a single GPU, and ZeRO can only partition the parameters, gradients, and optimizer states, but not the activation memory! We recall from <a target="_self" href="#memory_usage_in_transformers">the activation memory discussion</a> that this part of the memory scales with sequence length and batch size. Naturally we could just limit those, but in practice we don’t want to be limited by hardware to train only with a short sequence length. </p>
910
 
911
- <iframe class="l-body-outset" id="plotFrame6" src="assets/data/benchmarks/zero3_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
912
- <script>
 
913
  window.addEventListener('load', function() {
914
  const frame = document.getElementById('plotFrame6');
915
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
916
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
917
  });
918
- </script>
919
  <!-- <p><img alt="zero3_memoryusage.svg" src="/assets/images/zero3_memoryusage.svg" /></p> -->
920
 
921
  <p>To overcome these issues, it's time to explore a new, orthogonal axis of parallelism - Tensor Parallelism (TP). Unlike ZeRO-3, which relies on heavy parameter communication, TP proposes to shard parameters, gradients, optimizer states AND activations across devices without requiring any communication of model parameters between GPUs.</p>
@@ -1014,14 +1015,15 @@
1014
 
1015
  <p> Let's take a better look at the trade-off as we scale the TP degree:</p>
1016
 
1017
- <iframe class="l-body-outset" id="plotFrame13" src="assets/data/benchmarks/tp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
1018
- <script>
 
1019
  window.addEventListener('load', function() {
1020
  const frame = document.getElementById('plotFrame13');
1021
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1022
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1023
  });
1024
- </script>
1025
  <!--
1026
  <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p> -->
1027
 
@@ -1031,14 +1033,15 @@
1031
 
1032
  <p>This being said, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
1033
 
1034
- <iframe class="l-body-outset" id="plotFrame7" src="assets/data/benchmarks/tp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
1035
- <script>
 
1036
  window.addEventListener('load', function() {
1037
  const frame = document.getElementById('plotFrame7');
1038
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1039
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1040
- });
1041
- </script>
1042
  <!-- <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p> -->
1043
 
1044
  <p>Increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU to the point where we can start fitting a large model on a single node of 8 GPUs. </p>
@@ -1195,14 +1198,15 @@
1195
 
1196
  <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
1197
 
1198
- <iframe class="l-body-outset" id="plotFrame8" src="assets/data/benchmarks/tp_sp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
1199
- <script>
 
1200
  window.addEventListener('load', function() {
1201
  const frame = document.getElementById('plotFrame8');
1202
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1203
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1204
- });
1205
- </script>
1206
  <!-- <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p> -->
1207
 
1208
  <p>As we can see, we've again strongly reduced the maximum memory usage per GPU, allowing us to fit sequence lengths of 16k tokens with TP/SP=16, an improvement over the vanilla TP case! (TP=16 is still a bit large as we've seen in the previous section, but we'll see how we can improve this in the next section).</p>
@@ -1217,14 +1221,15 @@
1217
 
1218
  <p>We can benchmark how this communication overhead becomes increasingly problematic as we scale up tensor parallelism. Let’s measure the throughput and memory utilization as we scale TP with SP for a 3B model with 4096 seqlen:</p>
1219
 
1220
- <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
1221
- <script>
 
1222
  window.addEventListener('load', function() {
1223
  const frame = document.getElementById('plotFrame2');
1224
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1225
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1226
- });
1227
- </script>
1228
 
1229
  <!-- <p><img alt="tp_sp_scaling.svg" src="/assets/images/tp_sp_scaling.svg" /></p> -->
1230
  <p>Here again, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees enable processing of significantly larger batch sizes by reducing the activation memory, they also reduce per-GPU throughput, in particular above a threshold corresponding to the number of GPUs per node.</p>
@@ -1256,14 +1261,15 @@
1256
 
1257
  <p>Moreover, even if we use full recomputation of the activations (which comes at a heavy compute overhead of ~30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length. Let's take a look and see how Context Parallelism can help us:</p>
1258
 
1259
- <iframe class="l-body-outset" id="plotFrame9" src="assets/data/benchmarks/cp_8Bmemoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
1260
- <script>
 
1261
  window.addEventListener('load', function() {
1262
  const frame = document.getElementById('plotFrame9');
1263
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1264
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1265
  });
1266
- </script>
1267
 
1268
  <!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
1269
 
@@ -1357,14 +1363,15 @@
1357
 
1358
  <p>In the <a target="_self" href="#tensor-parallelism">Tensor Parallelism</a> section we saw that trying to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) hit a lower bandwidth network called “inter-node connection” which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we benchmark it on our cluster across several nodes (each node has 8 GPUs):</p>
1359
 
1360
- <iframe class="l-body-outset" id="plotFrame11" src="assets/data/benchmarks/pp_comm_bandwidth.html" width="90%" scrolling="no" frameborder="0"></iframe>
1361
- <script>
 
1362
  window.addEventListener('load', function() {
1363
  const frame = document.getElementById('plotFrame11');
1364
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1365
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1366
  });
1367
- </script>
1368
 
1369
  <!-- <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p> -->
1370
  <p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
@@ -1375,14 +1382,15 @@
1375
 
1376
  <aside>This technique may remind you of our discussion on <a target="_self" href="#zero-redundancy-optimizer">ZeRO-3</a> where we split the model parameters across GPUs. We compare both techniques in detail later in the <a target="_self" href="#5d_parallelism_in_a_nutshell">5D parallelism in a nutshell</a> section.</aside>
1377
 
1378
- <iframe class="l-body" id="plotFrame12" src="assets/data/benchmarks/pp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
1379
- <script>
 
1380
  window.addEventListener('load', function() {
1381
  const frame = document.getElementById('plotFrame12');
1382
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1383
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1384
  });
1385
- </script>
1386
  <!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
1387
 
1388
  <p>Looking at the figure above, we notice something interesting: while the model parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers will be sent to the next GPU to continue the forward pass.</p>
@@ -1508,14 +1516,15 @@
1508
 
1509
  <p>So we can now decrease the bubble by adding microbatches and interleaved stages, but note that quantitatively, the amount of communication also increases by <d-math>v</d-math> so it’s a trade off. In the following plot you can see several configurations for a PP setup with <d-math>p=8</d-math>, where the special case of <d-math>m=1, v=1</d-math> corresponds to naive pipeline parallelism and the configurations with <d-math>v=1</d-math> are AFAB or 1F1B setups and <d-math>v \neq 1</d-math> are interleaved configurations.</p>
1510
 
1511
- <iframe class="l-body" id="plotFrame23" src="assets/data/benchmarks/pp_bubblesize.html" width="90%" scrolling="no" frameborder="0"></iframe>
1512
- <script>
 
1513
  window.addEventListener('load', function() {
1514
  const frame = document.getElementById('plotFrame23');
1515
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1516
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1517
  });
1518
- </script>
1519
  <!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
1520
 
1521
 
@@ -2313,6 +2322,8 @@
2313
  <p>We know that instability increases as learning rates rise for a fixed model size<d-cite bibtex-key="wortsman2023smallscaleproxieslargescaletransformer"></d-cite>, making FP8 pretraining particularly tricky.</p>
2314
 
2315
  <iframe class="l-body-outset" id="plotFP8Loss" src="/assets/data/fp8/fp8_training_loss_curves.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
 
 
2316
 
2317
  <p>The first, successful, very large scale training with FP8 mixed precision was publicly reported on DeepSeek-V3. The authors carefully analyzed each operation of the forward pass (Fprop) as well as the activation (Dgrad) and weight (Wgrad) backward pass. Similar to BF16 mixed precision training, some aggregation and master weights are kept in higher precision while the operations themselves are performed in FP8. </p>
2318
 
@@ -2731,8 +2742,7 @@
2731
  <div>
2732
  <a href="https://gordicaleksa.medium.com/eli5-flash-attention-5c44017022ad"><strong>Aleksa's ELI5 Flash Attention</strong></a>
2733
  <p>Easy explanation of Flash Attention</p>
2734
- </div>
2735
-
2736
  <div>
2737
  <a href="https://github.com/tunib-ai/large-scale-lm-tutorials"><strong>TunibAI's 3D parallelism tutorial</strong></a>
2738
  <p>Large-scale language modeling tutorials with PyTorch.</p>
@@ -3243,4 +3253,4 @@
3243
 
3244
  </body>
3245
 
3246
- </html>
 
208
 
209
  <!-- <iframe id="plotFrame" src="assets/data/benchmarks/benchmarks_interactive.html" scrolling="no" frameborder="0" height="840" width="720"></iframe> -->
210
  <div id="fragment-benchmarks_interactive"></div>
 
211
 
212
  <p>As you can see, there’s a lot of ground to be covered. Before getting into the trenches of distributed training let’s take a quick high-level look at the challenges we'll cover in the book.</p>
213
 
 
338
  <div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
339
  <script src="../assets/images/first_steps_memory_profile.js"></script>
340
  -->
341
+ <!-- {{!-- <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe> --}} -->
342
+ <div id="fragment-memory-profile"></div>
343
 
344
  <p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
345
 
 
460
 
461
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
462
 
463
+ <div class="l-body-outset" id="fragment-memusage_activations"></div>
464
+ <!-- <script>
465
  window.addEventListener('load', function() {
466
  const frame = document.getElementById('plotFrame3');
467
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
468
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
469
  });
470
+ </script> -->
471
 
472
  <p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
473
 
 
495
 
496
  <p>Let’s see how drastically recomputation strategies can in practice reduce the memory footprint and how selective recomputation strikes a nice balance between memory saving and recomputation cost:</p>
497
 
498
+ <div id="fragment-memory-recomputation"></div>
499
 
500
  <p>Another trend that's clearly visible here is how the activations for long sequences play a bigger role for smaller models, so the effect of recomputation becomes even more noticeable.</p>
501
 
 
734
  <p>Let's see this happening in practice with some benchmarks:</p>
735
 
736
  <!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
737
+ <div class="l-body-outset" id="fragment-dp_scaling"></div>
738
+ <!-- <script>
739
  window.addEventListener('load', function() {
740
  const frame = document.getElementById('plotFrame4');
741
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
742
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
743
  });
744
+ </script> -->
745
 
746
  <p>We see that above some limit, our throughput starts to drop quite significantly while the memory usage per GPU stays constant and is not affected by adding more DP ranks.</p>
747
 
 
750
  <p>The keen reader has already probably noted however that this assumes that we can fit at least one input sample forward pass (<em>mbs=1</em>) into our GPU memory. This is not always the case! As we can see, larger models don’t fit into a single GPU, even with activation recomputation activated: </p>
751
  <aside>Tip: you can quickly eyeball the minimal memory required for your model’s parameters by multiplying by 2 e.g. 70B → 140GB (=133GiB)</aside>
752
 
753
+ <!-- <iframe class="l-body-outset" id="plotFrame5" src="assets/data/benchmarks/dp_ourjourney_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
754
+ <div class="l-body-outset" id="fragment-dp_ourjourney_memoryusage"></div>
755
+ <!-- <script>
756
  window.addEventListener('load', function() {
757
  const frame = document.getElementById('plotFrame5');
758
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
759
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
760
  });
761
+ </script> -->
762
  <!-- <p><img alt="dp_ourjourney_memoryusage.svg" src="/assets/images/dp_ourjourney_memoryusage.svg" /></p> -->
763
 
764
 
 
908
 
909
  <p>However, there is a limit here: DP only works if a layer of the model fits in a single GPU and ZeRO can only partition the parameters, gradients, and optimizer states, but not the activation memory! We recall from <a target="_self" href="#memory_usage_in_transformers">the activation memory discussion</a> that this part of the memory scales with sequence length and batch size. Naturally we could just limit those, but in practice we don’t want to be limited by hardware to train with only a short sequence length. </p>
910
 
911
+ <!-- <iframe class="l-body-outset" id="plotFrame6" src="assets/data/benchmarks/zero3_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
912
+ <div class="l-body-outset" id="fragment-zero3_memoryusage"></div>
913
+ <!-- <script>
914
  window.addEventListener('load', function() {
915
  const frame = document.getElementById('plotFrame6');
916
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
917
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
918
  });
919
+ </script> -->
920
  <!-- <p><img alt="zero3_memoryusage.svg" src="/assets/images/zero3_memoryusage.svg" /></p> -->
921
 
922
  <p>To overcome these issues, it's time to explore a new, orthogonal axis of parallelism - Tensor Parallelism (TP). Unlike ZeRO3 which relies on heavy parameter communication, TP proposes to shard parameters, gradients, optimizer states AND activations across devices without requiring any communication of model parameters between GPUs.</p>
 
1015
 
1016
  <p> Let's take a better look at the trade-off as we scale the TP degree:</p>
1017
 
1018
+ <!-- <iframe class="l-body-outset" id="plotFrame13" src="assets/data/benchmarks/tp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1019
+ <div class="l-body-outset" id="fragment-tp_scaling"></div>
1020
+ <!-- <script>
1021
  window.addEventListener('load', function() {
1022
  const frame = document.getElementById('plotFrame13');
1023
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1024
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1025
  });
1026
+ </script> -->
1027
  <!--
1028
  <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p> -->
1029
 
 
1033
 
1034
  <p>This being said, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
1035
 
1036
+ <!-- <iframe class="l-body-outset" id="plotFrame7" src="assets/data/benchmarks/tp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1037
+ <div class="l-body-outset" id="fragment-tp_memoryusage"></div>
1038
+ <!-- <script>
1039
  window.addEventListener('load', function() {
1040
  const frame = document.getElementById('plotFrame7');
1041
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1042
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1043
+ }); -->
1044
+ <!-- </script> -->
1045
  <!-- <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p> -->
1046
 
1047
  <p>Increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU to the point where we can start fitting a large model on a single node of 8 GPUs. </p>
 
1198
 
1199
  <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
1200
 
1201
+ <!-- <iframe class="l-body-outset" id="plotFrame8" src="assets/data/benchmarks/tp_sp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1202
+ <div class="l-body-outset" id="fragment-tp_sp_memoryusage"></div>
1203
+ <!-- <script>
1204
  window.addEventListener('load', function() {
1205
  const frame = document.getElementById('plotFrame8');
1206
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1207
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1208
+ }); -->
1209
+ <!-- </script> -->
1210
  <!-- <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p> -->
1211
 
1212
  <p>As we can see, we've again strongly reduced the maximum memory usage per GPU, allowing us to fit sequence lengths of 16k tokens with TP/SP=16, an improvement over the vanilla TP case! (TP=16 is still a bit large as we've seen in the previous section, but we'll see how we can improve this in the next section).</p>
 
1221
 
1222
  <p>We can benchmark how this communication overhead becomes increasingly problematic as we scale up tensor parallelism. Let’s measure the throughput and memory utilization as we scale TP with SP for a 3B model with 4096 seqlen:</p>
1223
 
1224
+ <!-- <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1225
+ <div class="l-body-outset" id="fragment-tp_sp_scaling"></div>
1226
+ <!-- <script>
1227
  window.addEventListener('load', function() {
1228
  const frame = document.getElementById('plotFrame2');
1229
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1230
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1231
+ }); -->
1232
+ <!-- </script> -->
1233
 
1234
  <!-- <p><img alt="tp_sp_scaling.svg" src="/assets/images/tp_sp_scaling.svg" /></p> -->
1235
  <p>Here again, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees enable processing of significantly larger batch sizes by reducing the activation memory, they also reduce per-GPU throughput, in particular above a threshold corresponding to the number of GPUs per node.</p>
 
1261
 
1262
  <p>Moreover, even if we use full recomputation of the activations (which comes at a heavy compute overhead of ~30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length. Let's take a look and see how Context Parallelism can help us:</p>
1263
 
1264
+ <!-- <iframe class="l-body-outset" id="plotFrame9" src="assets/data/benchmarks/cp_8Bmemoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1265
+ <div class="l-body-outset" id="fragment-cp_8Bmemoryusage"></div>
1266
+ <!-- <script>
1267
  window.addEventListener('load', function() {
1268
  const frame = document.getElementById('plotFrame9');
1269
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1270
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1271
  });
1272
+ </script> -->
1273
 
1274
  <!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
1275
 
 
1363
 
1364
  <p>In the <a target="_self" href="#tensor-parallelism">Tensor Parallelism</a> section we saw that trying to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) hit a lower bandwidth network called “inter-node connection” which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we benchmark it on our cluster across several nodes (each node has 8 GPUs):</p>
1365
 
1366
+ <!-- <iframe class="l-body-outset" id="plotFrame11" src="assets/data/benchmarks/pp_comm_bandwidth.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1367
+ <div class="l-body-outset" id="fragment-pp_comm_bandwidth"></div>
1368
+ <!-- <script>
1369
  window.addEventListener('load', function() {
1370
  const frame = document.getElementById('plotFrame11');
1371
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1372
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1373
  });
1374
+ </script> -->
1375
 
1376
  <!-- <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p> -->
1377
  <p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
 
1382
 
1383
  <aside>This technique may remind you of our discussion on <a target="_self" href="#zero-redundancy-optimizer">ZeRO-3</a> where we split the model parameters across GPUs. We compare both techniques in detail later in the <a target="_self" href="#5d_parallelism_in_a_nutshell">5D parallelism in a nutshell</a> section.</aside>
1384
 
1385
+ <!-- <iframe class="l-body" id="plotFrame12" src="assets/data/benchmarks/pp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1386
+ <div class="l-body" id="fragment-pp_memoryusage"></div>
1387
+ <!-- <script>
1388
  window.addEventListener('load', function() {
1389
  const frame = document.getElementById('plotFrame12');
1390
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1391
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1392
  });
1393
+ </script> -->
1394
  <!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
1395
 
1396
  <p>Looking at the figure above, we notice something interesting: while the model parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers will be sent to the next GPU to continue the forward pass.</p>
 
1516
 
1517
  <p>So we can now decrease the bubble by adding microbatches and interleaved stages, but note that quantitatively, the amount of communication also increases by <d-math>v</d-math> so it’s a trade off. In the following plot you can see several configurations for a PP setup with <d-math>p=8</d-math>, where the special case of <d-math>m=1, v=1</d-math> corresponds to naive pipeline parallelism and the configurations with <d-math>v=1</d-math> are AFAB or 1F1B setups and <d-math>v \neq 1</d-math> are interleaved configurations.</p>
1518
 
1519
+ <!-- <iframe class="l-body" id="plotFrame23" src="assets/data/benchmarks/pp_bubblesize.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
1520
+ <div class="l-body" id="fragment-pp_bubblesize"></div>
1521
+ <!-- <script>
1522
  window.addEventListener('load', function() {
1523
  const frame = document.getElementById('plotFrame23');
1524
  frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
1525
  frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
1526
  });
1527
+ </script> -->
1528
  <!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
1529
 
1530
 
 
2322
  <p>We know that instability increases as learning rates rise for a fixed model size<d-cite bibtex-key="wortsman2023smallscaleproxieslargescaletransformer"></d-cite>, making FP8 pretraining particularly tricky.</p>
2323
 
2324
  <iframe class="l-body-outset" id="plotFP8Loss" src="/assets/data/fp8/fp8_training_loss_curves.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
2325
+ <!-- Hynek uncomment this once it's added to -->
2326
+ <!-- <div class="l-body-outset" id="fragment-fp8_training_loss_curves"></div> -->
2327
 
2328
  <p>The first, successful, very large scale training with FP8 mixed precision was publicly reported on DeepSeek-V3. The authors carefully analyzed each operation of the forward pass (Fprop) as well as the activation (Dgrad) and weight (Wgrad) backward pass. Similar to BF16 mixed precision training, some aggregation and master weights are kept in higher precision while the operations themselves are performed in FP8. </p>
2329
 
 
2742
  <div>
2743
  <a href="https://gordicaleksa.medium.com/eli5-flash-attention-5c44017022ad"><strong>Aleksa's ELI5 Flash Attention</strong></a>
2744
  <p>Easy explanation of Flash Attention</p>
2745
+ </div>
 
2746
  <div>
2747
  <a href="https://github.com/tunib-ai/large-scale-lm-tutorials"><strong>TunibAI's 3D parallelism tutorial</strong></a>
2748
  <p>Large-scale language modeling tutorials with PyTorch.</p>
 
3253
 
3254
  </body>
3255
 
3256
+ </html>
webpack.config.js CHANGED
@@ -130,6 +130,14 @@ module.exports = {
130
  },
131
  {
132
  implementation: ImageMinimizerPlugin.svgoMinify,
 
 
 
 
 
 
 
 
133
  }
134
  ]
135
  }),
 
130
  },
131
  {
132
  implementation: ImageMinimizerPlugin.svgoMinify,
133
+ options: {
134
+ encodeOptions: {
135
+ multipass: true,
136
+ plugins: [
137
+ 'preset-default',
138
+ ]
139
+ }
140
+ }
141
  }
142
  ]
143
  }),