Spaces:
Running
Running
add dist + nits
Browse files- dist/assets/data/fp8/.DS_Store +0 -0
- dist/assets/images/5D_nutshell_tp_sp.svg +1 -1
- dist/assets/images/5d_full.svg +0 -0
- dist/assets/images/5d_nutshell_cp.svg +1 -1
- dist/assets/images/5d_nutshell_ep.svg +0 -0
- dist/assets/images/activation_recomputation.svg +1 -1
- dist/assets/images/cp_70bmemoryusage.svg +0 -0
- dist/assets/images/cp_attnmask.svg +0 -0
- dist/assets/images/cp_overlap_all2all.svg +1 -1
- dist/assets/images/cp_overlap_allgather.svg +1 -1
- dist/assets/images/cp_zigzagmask.svg +0 -0
- dist/assets/images/diving_primergpu.svg +0 -0
- dist/assets/images/diving_primergpu2.svg +0 -0
- dist/assets/images/dp_overlap1.svg +1 -1
- dist/assets/images/dp_overlap2.svg +1 -1
- dist/assets/images/dp_overlap3.svg +1 -1
- dist/assets/images/dp_zero1_overlap.svg +1 -1
- dist/assets/images/dp_zero2_overlap.svg +1 -1
- dist/assets/images/dp_zero3_bwd.svg +0 -0
- dist/assets/images/dp_zero3_fwd.svg +0 -0
- dist/assets/images/dp_zero3_overlap.svg +1 -1
- dist/assets/images/first_steps_memory_profile.svg +0 -0
- dist/assets/images/memory_profile.svg +0 -0
- dist/assets/images/pp_1f1b.svg +0 -0
- dist/assets/images/pp_1f1b_interleaved.svg +0 -0
- dist/assets/images/pp_afab.svg +1 -1
- dist/assets/images/pp_afab2.svg +0 -0
- dist/assets/images/tp_diagram.svg +1 -1
- dist/assets/images/tp_overlap.svg +1 -1
- dist/assets/images/tp_sp_overlap.svg +1 -1
- dist/assets/images/ultra-cheatsheet.svg +0 -0
- dist/assets/images/zero_memory.svg +0 -0
- dist/fragments/cp_8Bmemoryusage.html +1 -0
- dist/fragments/dp_ourjourney_memoryusage.html +1 -0
- dist/fragments/dp_scaling.html +1 -0
- dist/fragments/memusage_activations.html +1 -0
- dist/fragments/pp_bubblesize.html +1 -0
- dist/fragments/pp_comm_bandwidth.html +1 -0
- dist/fragments/pp_memoryusage.html +1 -0
- dist/fragments/tp_memoryusage.html +1 -0
- dist/fragments/tp_scaling.html +1 -0
- dist/fragments/tp_sp_memoryusage.html +1 -0
- dist/fragments/tp_sp_scaling.html +1 -0
- dist/fragments/zero3_memoryusage.html +1 -0
- dist/index.html +56 -45
- src/index.html +56 -46
- webpack.config.js +8 -0
dist/assets/data/fp8/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
dist/assets/images/5D_nutshell_tp_sp.svg
CHANGED
|
|
dist/assets/images/5d_full.svg
CHANGED
|
|
dist/assets/images/5d_nutshell_cp.svg
CHANGED
|
|
dist/assets/images/5d_nutshell_ep.svg
CHANGED
|
|
dist/assets/images/activation_recomputation.svg
CHANGED
|
|
dist/assets/images/cp_70bmemoryusage.svg
CHANGED
|
|
dist/assets/images/cp_attnmask.svg
CHANGED
|
|
dist/assets/images/cp_overlap_all2all.svg
CHANGED
|
|
dist/assets/images/cp_overlap_allgather.svg
CHANGED
|
|
dist/assets/images/cp_zigzagmask.svg
CHANGED
|
|
dist/assets/images/diving_primergpu.svg
CHANGED
|
|
dist/assets/images/diving_primergpu2.svg
CHANGED
|
|
dist/assets/images/dp_overlap1.svg
CHANGED
|
|
dist/assets/images/dp_overlap2.svg
CHANGED
|
|
dist/assets/images/dp_overlap3.svg
CHANGED
|
|
dist/assets/images/dp_zero1_overlap.svg
CHANGED
|
|
dist/assets/images/dp_zero2_overlap.svg
CHANGED
|
|
dist/assets/images/dp_zero3_bwd.svg
CHANGED
|
|
dist/assets/images/dp_zero3_fwd.svg
CHANGED
|
|
dist/assets/images/dp_zero3_overlap.svg
CHANGED
|
|
dist/assets/images/first_steps_memory_profile.svg
CHANGED
|
|
dist/assets/images/memory_profile.svg
CHANGED
|
|
dist/assets/images/pp_1f1b.svg
CHANGED
|
|
dist/assets/images/pp_1f1b_interleaved.svg
CHANGED
|
|
dist/assets/images/pp_afab.svg
CHANGED
|
|
dist/assets/images/pp_afab2.svg
CHANGED
|
|
dist/assets/images/tp_diagram.svg
CHANGED
|
|
dist/assets/images/tp_overlap.svg
CHANGED
|
|
dist/assets/images/tp_sp_overlap.svg
CHANGED
|
|
dist/assets/images/ultra-cheatsheet.svg
CHANGED
|
|
dist/assets/images/zero_memory.svg
CHANGED
|
|
dist/fragments/cp_8Bmemoryusage.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=995b326f-ddf4-41a8-b9e5-fce1da175b3f class=plotly-graph-div style="height:410px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("995b326f-ddf4-41a8-b9e5-fce1da175b3f")&&Plotly.newPlot("995b326f-ddf4-41a8-b9e5-fce1da175b3f",[{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!0,x:["1024","4096","16384","65536","131072"],y:[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!0,x:["1024","4096","16384","65536","131072"],y:[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!0,x:["1024","4096","16384","65536","131072"],y:[59.828125,59.828125,59.828125,59.828125,59.828125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!0,x:["1024","4096","16384","65536","131072"],y:[4.25,17,68,272,544],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer 
States",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[29.9140625,29.9140625,29.9140625,29.9140625,29.9140625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[2.75,11,44,176,352],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[7.478515625,7.478515625,7.478515625,7.478515625,7.478515625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[29.9140625,29.9140625,29.9140625,29.9140625,29.9140625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384","65536","131072"],y:[.6875,2.75,11,44,88],type:"bar",xaxis:"x3",yaxis:"y3"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26
"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:"
"}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#e
d7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2888888888888889],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray",title:{text:"Memory Usage (GB)"}},xaxis2:{anchor:"y2",domain:[.35555555555555557,.6444444444444445],title:{text:"Sequence 
Length"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},xaxis3:{anchor:"y3",domain:[.7111111111111111,1],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis3:{anchor:"x3",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"No Parallelism",x:.14444444444444446,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=2 CP=1",x:.5,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=2 CP=4",x:.8555555555555556,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x3 domain",y0:80,y1:80,yref:"y3"}],title:{text:"Memory Usage for 8B Model"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:1e3,height:410},{responsive:!0})</script> </div>
|
dist/fragments/dp_ourjourney_memoryusage.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=9f507a17-fb27-4b9a-9224-34ffad9cd0d4 class=plotly-graph-div style="height:410px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("9f507a17-fb27-4b9a-9224-34ffad9cd0d4")&&Plotly.newPlot("9f507a17-fb27-4b9a-9224-34ffad9cd0d4",[{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[2.3017578125,2.3017578125,2.3017578125,2.3017578125,2.3017578125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[2.3017578125,2.3017578125,2.3017578125,2.3017578125,2.3017578125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer states",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[9.20703125,9.20703125,9.20703125,9.20703125,9.20703125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[1.0625,2.125,4.25,8.5,17],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[14.95703125,14.95703125,14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer 
states",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[59.828125,59.828125,59.828125,59.828125,59.828125],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[4.25,8.5,17,34,68],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[131.4140625,131.4140625,131.4140625,131.4140625,131.4140625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[131.4140625,131.4140625,131.4140625,131.4140625,131.4140625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer states",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[525.65625,525.65625,525.65625,525.65625,525.65625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[21.25,42.5,85,170,340],type:"bar",xaxis:"x3",yaxis:"y3"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"hea
tmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar
",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[
.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2888888888888889]},yaxis:{anchor:"x",domain:[0,1],range:[0,150],title:{text:"GB memory"}},xaxis2:{anchor:"y2",domain:[.35555555555555557,.6444444444444445]},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150]},xaxis3:{anchor:"y3",domain:[.7111111111111111,1]},yaxis3:{anchor:"x3",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150]},annotations:[{font:{size:16},showarrow:!1,text:"1B model",x:.14444444444444446,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"8B 
model",x:.5,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"70B model",x:.8555555555555556,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x3 domain",y0:80,y1:80,yref:"y3"}],title:{text:"Memory Usage vs Sequence Length for Different Model Sizes"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:1e3,height:410},{responsive:!0})</script> </div>
|
dist/fragments/dp_scaling.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=f6b00dd8-6230-46cf-9b38-f7fd425a1dd3 class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("f6b00dd8-6230-46cf-9b38-f7fd425a1dd3")&&Plotly.newPlot("f6b00dd8-6230-46cf-9b38-f7fd425a1dd3",[{marker:{color:"#4ea5b7"},name:"Throughput (tokens/sec/GPU)",width:.7,x:["8","16","32","64","128","256"],y:[40149.94,37609.69,35367.61,31112.23,26446.44,15700.38],type:"bar",xaxis:"x",yaxis:"y"},{base:[37609.69],marker:{color:"#e889ab"},name:"Performance Drop",showlegend:!0,width:.0875,x:["16"],y:[2540.25],type:"bar",xaxis:"x",yaxis:"y"},{base:[35367.61],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["32"],y:[2242.0800000000017],type:"bar",xaxis:"x",yaxis:"y"},{base:[31112.23],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["64"],y:[4255.380000000001],type:"bar",xaxis:"x",yaxis:"y"},{base:[26446.44],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["128"],y:[4665.790000000001],type:"bar",xaxis:"x",yaxis:"y"},{base:[15700.38],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["256"],y:[10746.06],type:"bar",xaxis:"x",yaxis:"y"},{line:{color:"#e889ab"},marker:{color:"#e889ab"},mode:"lines+markers",name:"Memory Usage 
(GB)",x:["8","16","32","64","128","256"],y:[36.66,36.66,36.66,36.66,36.66,36.66],type:"scatter",xaxis:"x2",yaxis:"y2"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,
"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.
2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gr
idcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.45],title:{text:"Data Parallelism (DP)"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],title:{text:"Throughput (tokens/sec/GPU)"},showgrid:!0,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.55,1],title:{text:"Data Parallelism (DP)"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],title:{text:"Memory Usage (GB)"},showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"Throughput Scaling with Data Parallelism",x:.225,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"Memory Usage Scaling with Data Parallelism",x:.775,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{color:"#e889ab"},showarrow:!1,text:"-6.3%",x:1,xanchor:"center",xref:"x",xshift:30,y:38879.815,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-6.0%",x:2,xanchor:"center",xref:"x",xshift:30,y:36488.65,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-12.0%",x:3,xanchor:"center",xref:"x",xshift:30,y:33239.92,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-15.0%",x:4,xanchor:"center",xref:"x",xshift:30,y:28779.335,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-40.6%",x:5,xanchor:"center",xref:"x",xshift:30,y:21073.41,yanchor:"middle",yref:"y"}],legend:{x:.55,y:1},width:1e3,height:400,barmode:"stack"},{responsive:!0})</script> </div>
|
dist/fragments/memusage_activations.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=121faba8-d8ec-447e-9ab1-9a8fa34c1f63 class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("121faba8-d8ec-447e-9ab1-9a8fa34c1f63")&&Plotly.newPlot("121faba8-d8ec-447e-9ab1-9a8fa34c1f63",[{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer states",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625,52.42681884765625],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!0,x:["1024","2048","4096","8192","16384"],y:[9.25390625,28.5078125,97.015625,354.03125,1348.0625],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer 
states",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125,488.8917236328125],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[46.2578125,142.515625,485.03125,1770.0625,6740.125],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"parameters",marker:{color:"#4ea5b7"},name:"parameters",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"gradients",marker:{color:"#e889ab"},name:"gradients",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"optimizer states",marker:{color:"#cec0fa"},name:"optimizer states",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125,3041.8564453125],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"activations",marker:{color:"#e38a42"},name:"activations",showlegend:!1,x:["1024","2048","4096","8192","16384"],y:[145.703125,448.90625,1527.8125,5575.625,21231.25],type:"bar",xaxis:"x3",yaxis:"y3"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444
444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",wid
th:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[
.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2888888888888889],showgrid:!0,gridwidth:1,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],title:{text:"GB 
memory"},showgrid:!0,gridwidth:1,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.35555555555555557,.6444444444444445],showgrid:!0,gridwidth:1,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],showgrid:!0,gridwidth:1,gridcolor:"LightGray"},xaxis3:{anchor:"y3",domain:[.7111111111111111,1],showgrid:!0,gridwidth:1,gridcolor:"LightGray"},yaxis3:{anchor:"x3",domain:[0,1],showgrid:!0,gridwidth:1,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"Meta-Llama-3.1-8B",x:.14444444444444446,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"Meta-Llama-3.1-70B",x:.5,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"Meta-Llama-3.1-405B",x:.8555555555555556,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],barmode:"stack",width:1e3,height:400,legend:{title:{}}},{responsive:!0})</script> </div>
|
dist/fragments/pp_bubblesize.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=3997052d-7390-4347-abf3-6c8b08e53c31 class=plotly-graph-div style="height:650px; width:900px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("3997052d-7390-4347-abf3-6c8b08e53c31")&&Plotly.newPlot("3997052d-7390-4347-abf3-6c8b08e53c31",[{marker:{color:"#4ea5b7"},orientation:"h",text:["0.03","0.05","0.05","0.11","0.11","0.11","0.22","0.22","0.22","0.22","0.44","0.44","0.44","0.44","0.88","0.88","0.88","0.88","1.75","1.75","1.75","3.50","3.50","7.00"],textposition:"outside",x:[.02734375,.0546875,.0546875,.109375,.109375,.109375,.21875,.21875,.21875,.21875,.4375,.4375,.4375,.4375,.875,.875,.875,.875,1.75,1.75,1.75,3.5,3.5,7],y:["m=32, v=8","m=16, v=8","m=32, v=4","m=32, v=2","m=16, v=4","m=8, v=8","m=32, v=1","m=16, v=2","m=8, v=4","m=4, v=8","m=4, v=4","m=8, v=2","m=2, v=8","m=16, v=1","m=8, v=1","m=2, v=4","m=1, v=8","m=4, v=2","m=4, v=1","m=2, v=2","m=1, v=4","m=2, v=1","m=1, v=2","m=1, v=1"],type:"bar"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"]
,[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scat
ter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#
d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},yaxis:{tickmode:"array",tickvals:["m=32, v=8","m=16, v=8","m=32, v=4","m=32, v=2","m=16, v=4","m=8, v=8","m=32, v=1","m=16, v=2","m=8, v=4","m=4, v=8","m=4, v=4","m=8, v=2","m=2, v=8","m=16, v=1","m=8, v=1","m=2, v=4","m=1, v=8","m=4, v=2","m=4, v=1","m=2, v=2","m=1, v=4","m=2, v=1","m=1, v=2","m=1, v=1"],ticktext:["m=32, v=8","m=16, v=8","m=32, v=4","m=32, v=2","m=16, v=4","m=8, v=8","m=32, v=1","m=16, v=2","m=8, v=4","m=4, v=8","m=4, v=4","m=8, v=2","m=2, v=8","m=16, v=1","m=8, v=1","m=2, v=4","m=1, v=8","m=4, v=2","m=4, v=1","m=2, v=2","m=1, v=4","m=2, v=1","m=1, v=2","m=1, v=1"],title:{text:"PP configuration"}},margin:{l:150,r:100,t:100,b:100},title:{text:"Bubble size for PP=8"},xaxis:{title:{text:"Bubble size"}},width:900,height:650},{responsive:!0})</script> </div>
|
dist/fragments/pp_comm_bandwidth.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=91842954-4418-43b7-bc03-81a893ff71fe class=plotly-graph-div style="height:410px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("91842954-4418-43b7-bc03-81a893ff71fe")&&Plotly.newPlot("91842954-4418-43b7-bc03-81a893ff71fe",[{fill:"toself",fillcolor:"rgba(78,165,183,0.2)",hoverinfo:"skip",line:{color:"rgba(255,255,255,0)"},showlegend:!1,x:[0,1,2,3,4,5,6,6,5,4,3,2,1,0],y:[461.538,436.13,219.09099999999998,192.73399999999992,188.42249999999999,194.227,177.35999999999999,12.493500000000001,24.965,31.34,41.587,44.509,302.33500000000004,422.288],type:"scatter"},{line:{color:"#4ea5b7",width:3},marker:{size:10,symbol:"circle"},mode:"lines+markers+text",name:"AllReduce",text:["436.0","361.7","160.1","99.6","84.7","64.9","32.9"],textposition:"bottom center",x:[0,1,2,3,4,5,6],y:[435.9668115942029,361.74920529801324,160.13950738916256,99.56427561837455,84.74052884615384,64.92543661971831,32.937847222222224],type:"scatter"},{fill:"toself",fillcolor:"rgba(232,137,171,0.2)",hoverinfo:"skip",line:{color:"rgba(255,255,255,0)"},showlegend:!1,x:[0,1,2,3,4,5,6,6,5,4,3,2,1,0],y:[264.93,226.26999999999998,229.40999999999997,178.47899999999998,126.6575,77.026,44.4165,6.1535,12.314,24.525000000000002,47.147,40.757000000000005,147.97500000000002,239.55200000000002],type:"scatter"},{line:{color:"#e889ab",width:3},marker:{size:10,symbol:"square"},mode:"lines+markers",name:"AllGather",x:[0,1,2,3,4,5,6],y:[249.84884057971013,184.61324503311258,118.96753694581281,68.99752650176679,54.972283653846155,27.969183098591547,11.038298611111111],type:"scatter"},{fill:"toself",fillcolor:"rgba(206,192,250,0.2)",hoverinfo:"skip",line:{color:"rgba(255,255,255,0)"},showlegend:!1,x:[0,1,2,3,4,5,6,6,5,4,3,2,1,0],y:[264.64599999999996,226.37,215.492,177.54299999999998,126.4825,77.289,45.1295,6.1005,12.39,24.544999999999998,46.802,41.176,146.1,240.804],type:"scatter"},{line:{color:"#cec0fa",width:3},marker:{size:10,symbol:"triangle-up"},mode:"lines+marke
rs",name:"ReduceScatter",x:[0,1,2,3,4,5,6],y:[249.72898550724636,181.5535761589404,115.7576354679803,68.55106007067138,54.524230769230776,27.944281690140844,11.069652777777778],type:"scatter"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.666
6666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line
:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks
:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{title:{text:"Number of Nodes"},tickmode:"array",tickvals:[0,1,2,3,4,5,6],ticktext:["1","2","4","8","16","32","64"]},yaxis:{title:{text:"Bandwidth (GB/s)"},range:[0,480],gridcolor:"rgba(0,0,0,0.1)"},legend:{x:.85,y:1,bgcolor:"rgba(255,255,255,0.5)"},margin:{l:80,r:80,t:80,b:80},title:{text:"Communication Bandwidth by Number of Nodes (size=256MB)"},width:1e3,height:410},{responsive:!0})</script> </div>
|
dist/fragments/pp_memoryusage.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=8db330ae-52ab-4193-8ee5-dfa289fe4609 class=plotly-graph-div style="height:410px; width:800px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("8db330ae-52ab-4193-8ee5-dfa289fe4609")&&Plotly.newPlot("8db330ae-52ab-4193-8ee5-dfa289fe4609",[{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!0,x:["1024","4096","16384"],y:[14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!0,x:["1024","4096","16384"],y:[14.95703125,14.95703125,14.95703125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!0,x:["1024","4096","16384"],y:[59.828125,59.828125,59.828125],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!0,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[2.603515625,2.603515625,2.603515625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[2.603515625,2.603515625,2.603515625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer 
States",showlegend:!1,x:["1024","4096","16384"],y:[10.4140625,10.4140625,10.4140625],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x2",yaxis:"y2"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333
333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},heade
r:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:
{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.45],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray",title:{text:"Memory Usage (GB)"}},xaxis2:{anchor:"y2",domain:[.55,1],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"No Parallelism",x:.225,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"PP=8",x:.775,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"}],title:{text:"Memory Usage for 8B Model"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:800,height:410},{responsive:!0})</script> </div>
|
dist/fragments/tp_memoryusage.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=eed45cdb-8f9e-4c54-b31f-85062d3362e8 class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("eed45cdb-8f9e-4c54-b31f-85062d3362e8")&&Plotly.newPlot("eed45cdb-8f9e-4c54-b31f-85062d3362e8",[{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!0,x:["1024","4096","16384"],y:[131.5,131.5,131.5],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!0,x:["1024","4096","16384"],y:[131.5,131.5,131.5],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!0,x:["1024","4096","16384"],y:[526,526,526],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!0,x:["1024","4096","16384"],y:[21.25,85,340],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[16.4375,16.4375,16.4375],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[16.4375,16.4375,16.4375],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384"],y:[65.75,65.75,65.75],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[8.125,32.5,130],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model 
Parameters",showlegend:!1,x:["1024","4096","16384"],y:[8.21875,8.21875,8.21875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[8.21875,8.21875,8.21875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384"],y:[32.875,32.875,32.875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[7.1875,28.75,115],type:"bar",xaxis:"x3",yaxis:"y3"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.55555555555555
56,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar
:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",t
icks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2888888888888889],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],range:[0,150],dtick:20,title:{text:"Memory Usage (GB)"},showgrid:!0,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.35555555555555557,.6444444444444445],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},xaxis3:{anchor:"y3",domain:[.7111111111111111,1],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis3:{anchor:"x3",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"No Parallelism 
(TP-1)",x:.14444444444444446,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=8",x:.5,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=16",x:.8555555555555556,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x3 domain",y0:80,y1:80,yref:"y3"}],title:{text:"Memory Usage for 70B Model"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:1e3,height:400},{responsive:!0})</script> </div>
|
dist/fragments/tp_scaling.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=8de1ceac-2d60-4368-8691-415db492ed15 class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("8de1ceac-2d60-4368-8691-415db492ed15")&&Plotly.newPlot("8de1ceac-2d60-4368-8691-415db492ed15",[{marker:{color:"#4ea5b7"},name:"Tokens/sec/GPU",width:.7,x:["2","4","8","16","32"],y:[13923.18,12420.76,10903.32,6245.6,2146.44],type:"bar",xaxis:"x",yaxis:"y"},{base:[12420.76],marker:{color:"#e889ab"},name:"Performance Drop",showlegend:!0,width:.0875,x:["4"],y:[1502.42],type:"bar",xaxis:"x",yaxis:"y"},{base:[10903.32],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["8"],y:[1517.4400000000005],type:"bar",xaxis:"x",yaxis:"y"},{base:[6245.6],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["16"],y:[4657.719999999999],type:"bar",xaxis:"x",yaxis:"y"},{base:[2146.44],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["32"],y:[4099.16],type:"bar",xaxis:"x",yaxis:"y"},{marker:{color:"#cec0fa"},name:"Max Batch 
Size",text:["3","8","12","16","20"],textposition:"inside",width:.7,x:["2","4","8","16","32"],y:[3,8,12,16,20],type:"bar",xaxis:"x2",yaxis:"y2"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9
f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"ov
erlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{bac
kgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.45],title:{text:"Tensor Parallelism (TP)"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],title:{text:"Tokens/sec/GPU"},showgrid:!0,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.55,1],title:{text:"Tensor Parallelism (TP)"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],title:{text:"Maximum Batch Size"},showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"Throughput Scaling with TP (3B Model)",x:.225,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"Maximum Batch Size per TP Value",x:.775,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{color:"#e889ab"},showarrow:!1,text:"-10.8%",x:1,xanchor:"center",xref:"x",xshift:30,y:13171.970000000001,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-12.2%",x:2,xanchor:"center",xref:"x",xshift:30,y:11662.04,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-42.7%",x:3,xanchor:"center",xref:"x",xshift:30,y:8574.46,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-65.6%",x:4,xanchor:"center",xref:"x",xshift:30,y:4196.02,yanchor:"middle",yref:"y"}],legend:{x:.55,y:1},width:1e3,height:400,barmode:"stack"},{responsive:!0})</script> </div>
|
dist/fragments/tp_sp_memoryusage.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=16242941-9fab-40ed-996e-ddd93a5a627c class=plotly-graph-div style="height:410px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("16242941-9fab-40ed-996e-ddd93a5a627c")&&Plotly.newPlot("16242941-9fab-40ed-996e-ddd93a5a627c",[{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!0,x:["1024","4096","16384"],y:[131.5,131.5,131.5],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[16.4375,16.4375,16.4375],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[8.21875,8.21875,8.21875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!0,x:["1024","4096","16384"],y:[131.5,131.5,131.5],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[16.4375,16.4375,16.4375],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[8.21875,8.21875,8.21875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!0,x:["1024","4096","16384"],y:[526,526,526],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384"],y:[65.75,65.75,65.75],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer 
States",showlegend:!1,x:["1024","4096","16384"],y:[32.875,32.875,32.875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!0,x:["1024","4096","16384"],y:[21.25,85,340],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[2.65625,10.625,42.5],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[1.328125,5.3125,21.25],type:"bar",xaxis:"x3",yaxis:"y3"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.666666
6666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks
:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standof
f:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2888888888888889],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray",title:{text:"Memory Usage (GB)"}},xaxis2:{anchor:"y2",domain:[.35555555555555557,.6444444444444445],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},xaxis3:{anchor:"y3",domain:[.7111111111111111,1],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis3:{anchor:"x3",domain:[0,1],matches:"y",showticklabels:!1,range:[0,150],dtick:20,showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"No Parallelism",x:.14444444444444446,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=8 (with SP)",x:.5,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"TP=16 (with 
SP)",x:.8555555555555556,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x3 domain",y0:80,y1:80,yref:"y3"}],title:{text:"Memory Usage for 70B Model"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:1e3,height:410},{responsive:!0})</script> </div>
|
dist/fragments/tp_sp_scaling.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=e996eac5-dceb-42af-9cc6-d21f247621f5 class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("e996eac5-dceb-42af-9cc6-d21f247621f5")&&Plotly.newPlot("e996eac5-dceb-42af-9cc6-d21f247621f5",[{marker:{color:"#4ea5b7"},name:"Tokens/sec/GPU",width:.7,x:["2","4","8","16","32"],y:[14167.25,13460.16,10888.53,6159.3,3609.73],type:"bar",xaxis:"x",yaxis:"y"},{base:[13460.16],marker:{color:"#e889ab"},name:"Performance Drop",showlegend:!0,width:.0875,x:["4"],y:[707.0900000000001],type:"bar",xaxis:"x",yaxis:"y"},{base:[10888.53],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["8"],y:[2571.629999999999],type:"bar",xaxis:"x",yaxis:"y"},{base:[6159.3],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["16"],y:[4729.2300000000005],type:"bar",xaxis:"x",yaxis:"y"},{base:[3609.73],marker:{color:"#e889ab"},showlegend:!1,width:.0875,x:["32"],y:[2549.57],type:"bar",xaxis:"x",yaxis:"y"},{marker:{color:"#cec0fa"},name:"Max Batch 
Size",text:["4","10","20","40","100"],textposition:"inside",width:.7,x:["2","4","8","16","32"],y:[4,10,20,40,100],type:"bar",xaxis:"x2",yaxis:"y2"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"
#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{colorbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode
:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:
{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.45],title:{text:"Tensor Parallelism (TP)"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],title:{text:"Tokens/sec/GPU"},showgrid:!0,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.55,1],title:{text:"Tensor Parallelism (TP)"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],title:{text:"Maximum Batch Size"},showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"Throughput Scaling with TP/SP (3B Model)",x:.225,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"Maximum Batch Size per TP Value",x:.775,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{color:"#e889ab"},showarrow:!1,text:"-5.0%",x:1,xanchor:"center",xref:"x",xshift:30,y:13813.705,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-19.1%",x:2,xanchor:"center",xref:"x",xshift:30,y:12174.345000000001,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-43.4%",x:3,xanchor:"center",xref:"x",xshift:30,y:8523.915,yanchor:"middle",yref:"y"},{font:{color:"#e889ab"},showarrow:!1,text:"-41.4%",x:4,xanchor:"center",xref:"x",xshift:30,y:4884.515,yanchor:"middle",yref:"y"}],legend:{x:.55,y:1},width:1e3,height:400,barmode:"stack"},{responsive:!0})</script> </div>
|
dist/fragments/zero3_memoryusage.html
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
<div> <div id=be6d1423-335a-45b6-a848-dc4d0d70706f class=plotly-graph-div style="height:400px; width:1000px;"></div> <script>window.PLOTLYENV=window.PLOTLYENV||{},document.getElementById("be6d1423-335a-45b6-a848-dc4d0d70706f")&&Plotly.newPlot("be6d1423-335a-45b6-a848-dc4d0d70706f",[{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!0,x:["1024","4096","16384"],y:[15,15,15],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!0,x:["1024","4096","16384"],y:[15,15,15],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!0,x:["1024","4096","16384"],y:[60,60,60],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!0,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x",yaxis:"y"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[15,15,15],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[15,15,15],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384"],y:[7.5,7.5,7.5],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x2",yaxis:"y2"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[15,15,15],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[1.875,1.875,1.875],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer 
States",showlegend:!1,x:["1024","4096","16384"],y:[7.5,7.5,7.5],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x3",yaxis:"y3"},{legendgroup:"Model Parameters",marker:{color:"#4ea5b7"},name:"Model Parameters",showlegend:!1,x:["1024","4096","16384"],y:[1.875,1.875,1.875],type:"bar",xaxis:"x4",yaxis:"y4"},{legendgroup:"Gradients",marker:{color:"#e889ab"},name:"Gradients",showlegend:!1,x:["1024","4096","16384"],y:[1.875,1.875,1.875],type:"bar",xaxis:"x4",yaxis:"y4"},{legendgroup:"Optimizer States",marker:{color:"#cec0fa"},name:"Optimizer States",showlegend:!1,x:["1024","4096","16384"],y:[7.5,7.5,7.5],type:"bar",xaxis:"x4",yaxis:"y4"},{legendgroup:"Activations",marker:{color:"#e38a42"},name:"Activations",showlegend:!1,x:["1024","4096","16384"],y:[4.25,17,68],type:"bar",xaxis:"x4",yaxis:"y4"}],{template:{data:{histogram2dcontour:[{type:"histogram2dcontour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],choropleth:[{type:"choropleth",colorbar:{outlinewidth:0,ticks:""}}],histogram2d:[{type:"histogram2d",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmap:[{type:"heatmap",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7
777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],heatmapgl:[{type:"heatmapgl",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],contourcarpet:[{type:"contourcarpet",colorbar:{outlinewidth:0,ticks:""}}],contour:[{type:"contour",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],surface:[{type:"surface",colorbar:{outlinewidth:0,ticks:""},colorscale:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]]}],mesh3d:[{type:"mesh3d",colorbar:{outlinewidth:0,ticks:""}}],scatter:[{fillpattern:{fillmode:"overlay",size:10,solidity:.2},type:"scatter"}],parcoords:[{type:"parcoords",line:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolargl:[{type:"scatterpolargl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],bar:[{error_x:{color:"#2a3f5f"},error_y:{color:"#2a3f5f"},marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"bar"}],scattergeo:[{type:"scattergeo",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterpolar:[{type:"scatterpolar",marker:{colorbar:{outlinewidth:0,ticks:""}}}],histogram:[{marker:{pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"histogram"}],scattergl:[{type:"scattergl",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatter3d:[{type:"scatter3d",line:{col
orbar:{outlinewidth:0,ticks:""}},marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattermapbox:[{type:"scattermapbox",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scatterternary:[{type:"scatterternary",marker:{colorbar:{outlinewidth:0,ticks:""}}}],scattercarpet:[{type:"scattercarpet",marker:{colorbar:{outlinewidth:0,ticks:""}}}],carpet:[{aaxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},baxis:{endlinecolor:"#2a3f5f",gridcolor:"white",linecolor:"white",minorgridcolor:"white",startlinecolor:"#2a3f5f"},type:"carpet"}],table:[{cells:{fill:{color:"#EBF0F8"},line:{color:"white"}},header:{fill:{color:"#C8D4E3"},line:{color:"white"}},type:"table"}],barpolar:[{marker:{line:{color:"#E5ECF6",width:.5},pattern:{fillmode:"overlay",size:10,solidity:.2}},type:"barpolar"}],pie:[{automargin:!0,type:"pie"}]},layout:{autotypenumbers:"strict",colorway:["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],font:{color:"#2a3f5f"},hovermode:"closest",hoverlabel:{align:"left"},paper_bgcolor:"white",plot_bgcolor:"#E5ECF6",polar:{bgcolor:"#E5ECF6",angularaxis:{gridcolor:"white",linecolor:"white",ticks:""},radialaxis:{gridcolor:"white",linecolor:"white",ticks:""}},ternary:{bgcolor:"#E5ECF6",aaxis:{gridcolor:"white",linecolor:"white",ticks:""},baxis:{gridcolor:"white",linecolor:"white",ticks:""},caxis:{gridcolor:"white",linecolor:"white",ticks:""}},coloraxis:{colorbar:{outlinewidth:0,ticks:""}},colorscale:{sequential:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed7953"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],sequentialminus:[[0,"#0d0887"],[.1111111111111111,"#46039f"],[.2222222222222222,"#7201a8"],[.3333333333333333,"#9c179e"],[.4444444444444444,"#bd3786"],[.5555555555555556,"#d8576b"],[.6666666666666666,"#ed79
53"],[.7777777777777778,"#fb9f3a"],[.8888888888888888,"#fdca26"],[1,"#f0f921"]],diverging:[[0,"#8e0152"],[.1,"#c51b7d"],[.2,"#de77ae"],[.3,"#f1b6da"],[.4,"#fde0ef"],[.5,"#f7f7f7"],[.6,"#e6f5d0"],[.7,"#b8e186"],[.8,"#7fbc41"],[.9,"#4d9221"],[1,"#276419"]]},xaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},yaxis:{gridcolor:"white",linecolor:"white",ticks:"",title:{standoff:15},zerolinecolor:"white",automargin:!0,zerolinewidth:2},scene:{xaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},yaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2},zaxis:{backgroundcolor:"#E5ECF6",gridcolor:"white",linecolor:"white",showbackground:!0,ticks:"",zerolinecolor:"white",gridwidth:2}},shapedefaults:{line:{color:"#2a3f5f"}},annotationdefaults:{arrowcolor:"#2a3f5f",arrowhead:0,arrowwidth:1},geo:{bgcolor:"white",landcolor:"#E5ECF6",subunitcolor:"white",showland:!0,showlakes:!0,lakecolor:"white"},title:{x:.05},mapbox:{style:"light"}}},xaxis:{anchor:"y",domain:[0,.2125],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis:{anchor:"x",domain:[0,1],title:{text:"Memory Usage (GB)"},dtick:20,showgrid:!0,gridcolor:"LightGray"},xaxis2:{anchor:"y2",domain:[.2625,.475],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis2:{anchor:"x2",domain:[0,1],matches:"y",showticklabels:!1,showgrid:!0,gridcolor:"LightGray"},xaxis3:{anchor:"y3",domain:[.525,.7375],title:{text:"Sequence Length"},showgrid:!0,gridcolor:"LightGray"},yaxis3:{anchor:"x3",domain:[0,1],matches:"y",showticklabels:!1,showgrid:!0,gridcolor:"LightGray"},xaxis4:{anchor:"y4",domain:[.7875,1],title:{text:"Sequence 
Length"},showgrid:!0,gridcolor:"LightGray"},yaxis4:{anchor:"x4",domain:[0,1],matches:"y",showticklabels:!1,showgrid:!0,gridcolor:"LightGray"},annotations:[{font:{size:16},showarrow:!1,text:"DP=8",x:.10625,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"DP=8 Zero-1",x:.36875,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"DP=8 Zero-2",x:.6312500000000001,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"},{font:{size:16},showarrow:!1,text:"DP=8 Zero-3",x:.89375,xanchor:"center",xref:"paper",y:1,yanchor:"bottom",yref:"paper"}],shapes:[{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x domain",y0:80,y1:80,yref:"y"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x2 domain",y0:80,y1:80,yref:"y2"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x3 domain",y0:80,y1:80,yref:"y3"},{line:{color:"red",dash:"dash"},type:"line",x0:0,x1:1,xref:"x4 domain",y0:80,y1:80,yref:"y4"}],title:{text:"Memory Usage for 8B Model"},legend:{orientation:"v",x:1.02,y:.5},margin:{r:150},barmode:"stack",width:1e3,height:400},{responsive:!0})</script> </div>
|
dist/index.html
CHANGED
@@ -208,7 +208,6 @@
|
|
208 |
|
209 |
<!-- <iframe id="plotFrame" src="assets/data/benchmarks/benchmarks_interactive.html" scrolling="no" frameborder="0" height="840" width="720"></iframe> -->
|
210 |
<div id="fragment-benchmarks_interactive"></div>
|
211 |
-
<!-- -->
|
212 |
|
213 |
<p>As you can see, there’s a lot of ground to be covered. Before getting into the trenches of distributed training let’s take a quick high level look on the challenges we'll cover in the book.</p>
|
214 |
|
@@ -339,7 +338,8 @@
|
|
339 |
<div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
|
340 |
<script src="../assets/images/first_steps_memory_profile.js"></script>
|
341 |
-->
|
342 |
-
|
|
|
343 |
|
344 |
<p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
|
345 |
|
@@ -460,14 +460,14 @@
|
|
460 |
|
461 |
<p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
|
462 |
|
463 |
-
<
|
464 |
-
<script>
|
465 |
window.addEventListener('load', function() {
|
466 |
const frame = document.getElementById('plotFrame3');
|
467 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
468 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
469 |
});
|
470 |
-
</script>
|
471 |
|
472 |
<p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
|
473 |
|
@@ -495,7 +495,7 @@
|
|
495 |
|
496 |
<p>Let’s see how drastically recomputation strategies can in practice reduce the memory footprint and how selective recomputation strikes a nice balance between memory saving and recomputation cost:</p>
|
497 |
|
498 |
-
|
499 |
|
500 |
<p>Another trend that's clearly visibile here is how the activations for long sequences play a bigger role for smaller models, so the effect of recomputation becomes even more noticeable.</p>
|
501 |
|
@@ -734,14 +734,14 @@
|
|
734 |
<p>Lets see this happening in practice with some benchmark:</p>
|
735 |
|
736 |
<!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
|
737 |
-
<
|
738 |
-
<script>
|
739 |
window.addEventListener('load', function() {
|
740 |
const frame = document.getElementById('plotFrame4');
|
741 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
742 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
743 |
});
|
744 |
-
</script>
|
745 |
|
746 |
<p>We see that above some limit, our throughput starts to drop quite significantly while the memory usage per GPU stays constant and is not affected by adding more DP ranks.</p>
|
747 |
|
@@ -750,14 +750,15 @@
|
|
750 |
<p>The keen reader has already probably noted however that this assumes that we can fit at least one input sample forward pass (mbs<em>=1)</em> into our GPU memory. This is not always the case! As we can see, larger models don’t fit into a single GPU, even with activation recomputation activated: </p>
|
751 |
<aside>Tip: you can quickly eyeball the minimal memory required for your model’s parameters by multiplying by 2 e.g. 70B → 140GB (=133GiB)</aside>
|
752 |
|
753 |
-
<iframe class="l-body-outset" id="plotFrame5" src="assets/data/benchmarks/dp_ourjourney_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
754 |
-
<
|
|
|
755 |
window.addEventListener('load', function() {
|
756 |
const frame = document.getElementById('plotFrame5');
|
757 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
758 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
759 |
});
|
760 |
-
</script>
|
761 |
<!-- <p><img alt="dp_ourjourney_memoryusage.svg" src="/assets/images/dp_ourjourney_memoryusage.svg" /></p> -->
|
762 |
|
763 |
|
@@ -907,14 +908,15 @@
|
|
907 |
|
908 |
<p>However, there is a limit here, DP only works if a layer of the model fits in a single GPU and ZeRO can only partition the parameters, gradients, and optimizer states, but not the activation memory! We recall from <a target="_self" href="#memory_usage_in_transformers">the activation memory discussion</a> that this part of the memory scales with sequence length and batch size. Naturally we could just limit those, but in practice we don’t want to be limited by hardware to train with only with a short sequence length. </p>
|
909 |
|
910 |
-
<iframe class="l-body-outset" id="plotFrame6" src="assets/data/benchmarks/zero3_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
911 |
-
<
|
|
|
912 |
window.addEventListener('load', function() {
|
913 |
const frame = document.getElementById('plotFrame6');
|
914 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
915 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
916 |
});
|
917 |
-
</script>
|
918 |
<!-- <p><img alt="zero3_memoryusage.svg" src="/assets/images/zero3_memoryusage.svg" /></p> -->
|
919 |
|
920 |
<p>To overcome this issues, it's time to explore a new, orthogonal axis of parallelism - Tensor Parallelism (TP). Unlike ZeRO3 which relies on heavy parameter communication, TP proposes to shard parameters, gradients, optimizer states AND activations across devices without requiring any communication of model parameters between GPUs.</p>
|
@@ -1013,14 +1015,15 @@
|
|
1013 |
|
1014 |
<p> Let's take a better look at the trade-off as we scale the TP degree:</p>
|
1015 |
|
1016 |
-
<iframe class="l-body-outset" id="plotFrame13" src="assets/data/benchmarks/tp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1017 |
-
<
|
|
|
1018 |
window.addEventListener('load', function() {
|
1019 |
const frame = document.getElementById('plotFrame13');
|
1020 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1021 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1022 |
});
|
1023 |
-
</script>
|
1024 |
<!--
|
1025 |
<p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p> -->
|
1026 |
|
@@ -1030,14 +1033,15 @@
|
|
1030 |
|
1031 |
<p>This being said, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
|
1032 |
|
1033 |
-
<iframe class="l-body-outset" id="plotFrame7" src="assets/data/benchmarks/tp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1034 |
-
<
|
|
|
1035 |
window.addEventListener('load', function() {
|
1036 |
const frame = document.getElementById('plotFrame7');
|
1037 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1038 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1039 |
-
});
|
1040 |
-
</script>
|
1041 |
<!-- <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p> -->
|
1042 |
|
1043 |
<p>Increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU to the point where we can start fitting a large model on a single node of 8 GPUs. </p>
|
@@ -1194,14 +1198,15 @@
|
|
1194 |
|
1195 |
<p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
|
1196 |
|
1197 |
-
<iframe class="l-body-outset" id="plotFrame8" src="assets/data/benchmarks/tp_sp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1198 |
-
<
|
|
|
1199 |
window.addEventListener('load', function() {
|
1200 |
const frame = document.getElementById('plotFrame8');
|
1201 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1202 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1203 |
-
});
|
1204 |
-
</script>
|
1205 |
<!-- <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p> -->
|
1206 |
|
1207 |
<p>As we can see, we've again strongly reduced the maximum memory usage per GPU, allowing us to fit sequence lengths of 16k tokens with TP/SP=16, an improvement over the vanilla TP case! (TP=16 is still a bit large as we've seen in the previous section, but we'll see how we can improve this in the next section).</p>
|
@@ -1216,14 +1221,15 @@
|
|
1216 |
|
1217 |
<p>We can benchmark how this communication overhead becomes increasingly problematic as we scale up tensor parallelism. Let’s measure the throughput and memory utilization as we scale TP with SP for a 3B model with 4096 seqlen:</p>
|
1218 |
|
1219 |
-
<iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1220 |
-
<
|
|
|
1221 |
window.addEventListener('load', function() {
|
1222 |
const frame = document.getElementById('plotFrame2');
|
1223 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1224 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1225 |
-
});
|
1226 |
-
</script>
|
1227 |
|
1228 |
<!-- <p><img alt="tp_sp_scaling.svg" src="/assets/images/tp_sp_scaling.svg" /></p> -->
|
1229 |
<p>Here again, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees enable processing of significantly larger batch sizes by reducing the activation memory, they also reduce per-GPU throughput, in particular above a threshold corresponding to the number of GPUs per node.</p>
|
@@ -1255,14 +1261,15 @@
|
|
1255 |
|
1256 |
<p>Moreover, even if we use full recomputation of the activations (which comes at a heavy compute overhead of ~30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length. Let's take a look and see how Context Parallelism can help us:</p>
|
1257 |
|
1258 |
-
<iframe class="l-body-outset" id="plotFrame9" src="assets/data/benchmarks/cp_8Bmemoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1259 |
-
<
|
|
|
1260 |
window.addEventListener('load', function() {
|
1261 |
const frame = document.getElementById('plotFrame9');
|
1262 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1263 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1264 |
});
|
1265 |
-
</script>
|
1266 |
|
1267 |
<!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
|
1268 |
|
@@ -1356,14 +1363,15 @@
|
|
1356 |
|
1357 |
<p>In the <a target="_self" href="#tensor-parallelism">Tensor Parallelism</a> section we saw that trying to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) hits a lower-bandwidth network called “inter-node connection”, which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we benchmark it on our cluster across several nodes (each node has 8 GPUs):</p>
|
1358 |
|
1359 |
-
<iframe class="l-body-outset" id="plotFrame11" src="assets/data/benchmarks/pp_comm_bandwidth.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1360 |
-
<
|
|
|
1361 |
window.addEventListener('load', function() {
|
1362 |
const frame = document.getElementById('plotFrame11');
|
1363 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1364 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1365 |
});
|
1366 |
-
</script>
|
1367 |
|
1368 |
<!-- <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p> -->
|
1369 |
<p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
|
@@ -1374,14 +1382,15 @@
|
|
1374 |
|
1375 |
<aside>This technique may remind you of our discussion on <a target="_self" href="#zero-redundancy-optimizer">ZeRO-3</a> where we split the model parameters across GPUs. We compare both techniques in detail later in the <a target="_self" href="#5d_parallelism_in_a_nutshell">5D parallelism in a nutshell</a> section.</aside>
|
1376 |
|
1377 |
-
<iframe class="l-body" id="plotFrame12" src="assets/data/benchmarks/pp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1378 |
-
<
|
|
|
1379 |
window.addEventListener('load', function() {
|
1380 |
const frame = document.getElementById('plotFrame12');
|
1381 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1382 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1383 |
});
|
1384 |
-
</script>
|
1385 |
<!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
|
1386 |
|
1387 |
<p>Looking at the figure above, we notice something interesting: while the model parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers will be sent to the next GPU to continue the forward pass.</p>
|
@@ -1507,14 +1516,15 @@
|
|
1507 |
|
1508 |
<p>So we can now decrease the bubble by adding microbatches and interleaved stages, but note that quantitatively, the amount of communication also increases by <d-math>v</d-math> so it’s a trade off. In the following plot you can see several configurations for a PP setup with <d-math>p=8</d-math>, where the special case of <d-math>m=1, v=1</d-math> corresponds to naive pipeline parallelism and the configurations with <d-math>v=1</d-math> are AFAB or 1F1B setups and <d-math>v \neq 1</d-math> are interleaved configurations.</p>
|
1509 |
|
1510 |
-
<iframe class="l-body" id="plotFrame23" src="assets/data/benchmarks/pp_bubblesize.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1511 |
-
<
|
|
|
1512 |
window.addEventListener('load', function() {
|
1513 |
const frame = document.getElementById('plotFrame23');
|
1514 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1515 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1516 |
});
|
1517 |
-
</script>
|
1518 |
<!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
|
1519 |
|
1520 |
|
@@ -2312,6 +2322,8 @@
|
|
2312 |
<p>We know that instability increases as learning rates rise for a fixed model size<d-cite bibtex-key="wortsman2023smallscaleproxieslargescaletransformer"></d-cite>, making FP8 pretraining particularly tricky.</p>
|
2313 |
|
2314 |
<iframe class="l-body-outset" id="plotFP8Loss" src="/assets/data/fp8/fp8_training_loss_curves.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
|
|
|
|
|
2315 |
|
2316 |
<p>The first, successful, very large scale training with FP8 mixed precision was publicly reported on DeepSeek-V3. The authors carefully analyzed each operation of the forward pass (Fprop) as well as the activation (Dgrad) and weight (Wgrad) backward pass. Similar to BF16 mixed precision training, some aggregation and master weights are kept in higher precision while the operations themselves are performed in FP8. </p>
|
2317 |
|
@@ -2730,8 +2742,7 @@
|
|
2730 |
<div>
|
2731 |
<a href="https://gordicaleksa.medium.com/eli5-flash-attention-5c44017022ad"><strong>Aleksa's ELI5 Flash Attention</strong></a>
|
2732 |
<p>Easy explanation of Flash Attention</p>
|
2733 |
-
</div>
|
2734 |
-
|
2735 |
<div>
|
2736 |
<a href="https://github.com/tunib-ai/large-scale-lm-tutorials"><strong>TunibAI's 3D parallelism tutorial</strong></a>
|
2737 |
<p>Large-scale language modeling tutorials with PyTorch.</p>
|
@@ -3242,4 +3253,4 @@
|
|
3242 |
|
3243 |
</body>
|
3244 |
|
3245 |
-
</html>
|
|
|
208 |
|
209 |
<!-- <iframe id="plotFrame" src="assets/data/benchmarks/benchmarks_interactive.html" scrolling="no" frameborder="0" height="840" width="720"></iframe> -->
|
210 |
<div id="fragment-benchmarks_interactive"></div>
|
|
|
211 |
|
212 |
<p>As you can see, there’s a lot of ground to be covered. Before getting into the trenches of distributed training let’s take a quick high level look on the challenges we'll cover in the book.</p>
|
213 |
|
|
|
338 |
<div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
|
339 |
<script src="../assets/images/first_steps_memory_profile.js"></script>
|
340 |
-->
|
341 |
+
<!-- -->
|
342 |
+
<div id="fragment-memory-profile"></div>
|
343 |
|
344 |
<p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
|
345 |
|
|
|
460 |
|
461 |
<p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
|
462 |
|
463 |
+
<div class="l-body-outset" id="fragment-memusage_activations"></div>
|
464 |
+
<!-- <script>
|
465 |
window.addEventListener('load', function() {
|
466 |
const frame = document.getElementById('plotFrame3');
|
467 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
468 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
469 |
});
|
470 |
+
</script> -->
|
471 |
|
472 |
<p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
|
473 |
|
|
|
495 |
|
496 |
<p>Let’s see how drastically recomputation strategies can in practice reduce the memory footprint and how selective recomputation strikes a nice balance between memory saving and recomputation cost:</p>
|
497 |
|
498 |
+
<div id="fragment-memory-recomputation"></div>
|
499 |
|
500 |
<p>Another trend that's clearly visible here is how the activations for long sequences play a bigger role for smaller models, so the effect of recomputation becomes even more noticeable.</p>
|
501 |
|
|
|
734 |
<p>Lets see this happening in practice with some benchmark:</p>
|
735 |
|
736 |
<!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
|
737 |
+
<div class="l-body-outset" id="fragment-dp_scaling"></div>
|
738 |
+
<!-- <script>
|
739 |
window.addEventListener('load', function() {
|
740 |
const frame = document.getElementById('plotFrame4');
|
741 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
742 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
743 |
});
|
744 |
+
</script> -->
|
745 |
|
746 |
<p>We see that above some limit, our throughput starts to drop quite significantly while the memory usage per GPU stays constant and is not affected by adding more DP ranks.</p>
|
747 |
|
|
|
750 |
<p>The keen reader has already probably noted however that this assumes that we can fit at least one input sample forward pass (<em>mbs=1</em>) into our GPU memory. This is not always the case! As we can see, larger models don’t fit into a single GPU, even with activation recomputation activated: </p>
|
751 |
<aside>Tip: you can quickly eyeball the minimal memory required for your model’s parameters by multiplying by 2 e.g. 70B → 140GB (=133GiB)</aside>
|
752 |
|
753 |
+
<!-- <iframe class="l-body-outset" id="plotFrame5" src="assets/data/benchmarks/dp_ourjourney_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
754 |
+
<div class="l-body-outset" id="fragment-dp_ourjourney_memoryusage"></div>
|
755 |
+
<!-- <script>
|
756 |
window.addEventListener('load', function() {
|
757 |
const frame = document.getElementById('plotFrame5');
|
758 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
759 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
760 |
});
|
761 |
+
</script> -->
|
762 |
<!-- <p><img alt="dp_ourjourney_memoryusage.svg" src="/assets/images/dp_ourjourney_memoryusage.svg" /></p> -->
|
763 |
|
764 |
|
|
|
908 |
|
909 |
<p>However, there is a limit here, DP only works if a layer of the model fits in a single GPU and ZeRO can only partition the parameters, gradients, and optimizer states, but not the activation memory! We recall from <a target="_self" href="#memory_usage_in_transformers">the activation memory discussion</a> that this part of the memory scales with sequence length and batch size. Naturally we could just limit those, but in practice we don’t want to be limited by hardware to train with only a short sequence length. </p>
|
910 |
|
911 |
+
<!-- <iframe class="l-body-outset" id="plotFrame6" src="assets/data/benchmarks/zero3_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
912 |
+
<div class="l-body-outset" id="fragment-zero3_memoryusage"></div>
|
913 |
+
<!-- <script>
|
914 |
window.addEventListener('load', function() {
|
915 |
const frame = document.getElementById('plotFrame6');
|
916 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
917 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
918 |
});
|
919 |
+
</script> -->
|
920 |
<!-- <p><img alt="zero3_memoryusage.svg" src="/assets/images/zero3_memoryusage.svg" /></p> -->
|
921 |
|
922 |
<p>To overcome these issues, it's time to explore a new, orthogonal axis of parallelism - Tensor Parallelism (TP). Unlike ZeRO3 which relies on heavy parameter communication, TP proposes to shard parameters, gradients, optimizer states AND activations across devices without requiring any communication of model parameters between GPUs.</p>
|
|
|
1015 |
|
1016 |
<p> Let's take a better look at the trade-off as we scale the TP degree:</p>
|
1017 |
|
1018 |
+
<!-- <iframe class="l-body-outset" id="plotFrame13" src="assets/data/benchmarks/tp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1019 |
+
<div class="l-body-outset" id="fragment-tp_scaling"></div>
|
1020 |
+
<!-- <script>
|
1021 |
window.addEventListener('load', function() {
|
1022 |
const frame = document.getElementById('plotFrame13');
|
1023 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1024 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1025 |
});
|
1026 |
+
</script> -->
|
1027 |
<!--
|
1028 |
<p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p> -->
|
1029 |
|
|
|
1033 |
|
1034 |
<p>This being said, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
|
1035 |
|
1036 |
+
<!-- <iframe class="l-body-outset" id="plotFrame7" src="assets/data/benchmarks/tp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1037 |
+
<div class="l-body-outset" id="fragment-tp_memoryusage"></div>
|
1038 |
+
<!-- <script>
|
1039 |
window.addEventListener('load', function() {
|
1040 |
const frame = document.getElementById('plotFrame7');
|
1041 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1042 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1043 |
+
}); -->
|
1044 |
+
<!-- </script> -->
|
1045 |
<!-- <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p> -->
|
1046 |
|
1047 |
<p>Increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU to the point where we can start fitting a large model on a single node of 8 GPUs. </p>
|
|
|
1198 |
|
1199 |
<p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
|
1200 |
|
1201 |
+
<!-- <iframe class="l-body-outset" id="plotFrame8" src="assets/data/benchmarks/tp_sp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1202 |
+
<div class="l-body-outset" id="fragment-tp_sp_memoryusage"></div>
|
1203 |
+
<!-- <script>
|
1204 |
window.addEventListener('load', function() {
|
1205 |
const frame = document.getElementById('plotFrame8');
|
1206 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1207 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1208 |
+
}); -->
|
1209 |
+
<!-- </script> -->
|
1210 |
<!-- <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p> -->
|
1211 |
|
1212 |
<p>As we can see, we've again strongly reduced the maximum memory usage per GPU, allowing us to fit sequence lengths of 16k tokens with TP/SP=16, an improvement over the vanilla TP case! (TP=16 is still a bit large as we've seen in the previous section, but we'll see how we can improve this in the next section).</p>
|
|
|
1221 |
|
1222 |
<p>We can benchmark how this communication overhead becomes increasingly problematic as we scale up tensor parallelism. Let’s measure the throughput and memory utilization as we scale TP with SP for a 3B model with 4096 seqlen:</p>
|
1223 |
|
1224 |
+
<!-- <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1225 |
+
<div class="l-body-outset" id="fragment-tp_sp_scaling"></div>
|
1226 |
+
<!-- <script>
|
1227 |
window.addEventListener('load', function() {
|
1228 |
const frame = document.getElementById('plotFrame2');
|
1229 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1230 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1231 |
+
}); -->
|
1232 |
+
<!-- </script> -->
|
1233 |
|
1234 |
<!-- <p><img alt="tp_sp_scaling.svg" src="/assets/images/tp_sp_scaling.svg" /></p> -->
|
1235 |
<p>Here again, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees enable processing of significantly larger batch sizes by reducing the activation memory, they also reduce per-GPU throughput, in particular above a threshold corresponding to the number of GPUs per node.</p>
|
|
|
1261 |
|
1262 |
<p>Moreover, even if we use full recomputation of the activations (which comes at a heavy compute overhead of ~30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length. Let's take a look and see how Context Parallelism can help us:</p>
|
1263 |
|
1264 |
+
<!-- <iframe class="l-body-outset" id="plotFrame9" src="assets/data/benchmarks/cp_8Bmemoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1265 |
+
<div class="l-body-outset" id="fragment-cp_8Bmemoryusage"></div>
|
1266 |
+
<!-- <script>
|
1267 |
window.addEventListener('load', function() {
|
1268 |
const frame = document.getElementById('plotFrame9');
|
1269 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1270 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1271 |
});
|
1272 |
+
</script> -->
|
1273 |
|
1274 |
<!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
|
1275 |
|
|
|
1363 |
|
1364 |
<p>In the <a target="_self" href="#tensor-parallelism">Tensor Parallelism</a> section we saw that trying to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) hits a lower-bandwidth network called “inter-node connection”, which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we benchmark it on our cluster across several nodes (each node has 8 GPUs):</p>
|
1365 |
|
1366 |
+
<!-- <iframe class="l-body-outset" id="plotFrame11" src="assets/data/benchmarks/pp_comm_bandwidth.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1367 |
+
<div class="l-body-outset" id="fragment-pp_comm_bandwidth"></div>
|
1368 |
+
<!-- <script>
|
1369 |
window.addEventListener('load', function() {
|
1370 |
const frame = document.getElementById('plotFrame11');
|
1371 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1372 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1373 |
});
|
1374 |
+
</script> -->
|
1375 |
|
1376 |
<!-- <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p> -->
|
1377 |
<p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
|
|
|
1382 |
|
1383 |
<aside>This technique may remind you of our discussion on <a target="_self" href="#zero-redundancy-optimizer">ZeRO-3</a> where we split the model parameters across GPUs. We compare both techniques in detail later in the <a target="_self" href="#5d_parallelism_in_a_nutshell">5D parallelism in a nutshell</a> section.</aside>
|
1384 |
|
1385 |
+
<!-- <iframe class="l-body" id="plotFrame12" src="assets/data/benchmarks/pp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1386 |
+
<div class="l-body" id="fragment-pp_memoryusage"></div>
|
1387 |
+
<!-- <script>
|
1388 |
window.addEventListener('load', function() {
|
1389 |
const frame = document.getElementById('plotFrame12');
|
1390 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1391 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1392 |
});
|
1393 |
+
</script> -->
|
1394 |
<!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
|
1395 |
|
1396 |
<p>Looking at the figure above, we notice something interesting: while the model parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers will be sent to the next GPU to continue the forward pass.</p>
|
|
|
1516 |
|
1517 |
<p>So we can now decrease the bubble by adding microbatches and interleaved stages, but note that quantitatively, the amount of communication also increases by <d-math>v</d-math> so it’s a trade off. In the following plot you can see several configurations for a PP setup with <d-math>p=8</d-math>, where the special case of <d-math>m=1, v=1</d-math> corresponds to naive pipeline parallelism and the configurations with <d-math>v=1</d-math> are AFAB or 1F1B setups and <d-math>v \neq 1</d-math> are interleaved configurations.</p>
|
1518 |
|
1519 |
+
<!-- <iframe class="l-body" id="plotFrame23" src="assets/data/benchmarks/pp_bubblesize.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1520 |
+
<div class="l-body" id="fragment-pp_bubblesize"></div>
|
1521 |
+
<!-- <script>
|
1522 |
window.addEventListener('load', function() {
|
1523 |
const frame = document.getElementById('plotFrame23');
|
1524 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1525 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1526 |
});
|
1527 |
+
</script> -->
|
1528 |
<!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
|
1529 |
|
1530 |
|
|
|
2322 |
<p>We know that instability increases as learning rates rise for a fixed model size<d-cite bibtex-key="wortsman2023smallscaleproxieslargescaletransformer"></d-cite>, making FP8 pretraining particularly tricky.</p>
|
2323 |
|
2324 |
<iframe class="l-body-outset" id="plotFP8Loss" src="/assets/data/fp8/fp8_training_loss_curves.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
|
2325 |
+
<!-- Hynek uncomment this once it's added to -->
|
2326 |
+
<!-- <div class="l-body-outset" id="fragment-fp8_training_loss_curves"></div> -->
|
2327 |
|
2328 |
<p>The first, successful, very large scale training with FP8 mixed precision was publicly reported on DeepSeek-V3. The authors carefully analyzed each operation of the forward pass (Fprop) as well as the activation (Dgrad) and weight (Wgrad) backward pass. Similar to BF16 mixed precision training, some aggregation and master weights are kept in higher precision while the operations themselves are performed in FP8. </p>
|
2329 |
|
|
|
2742 |
<div>
|
2743 |
<a href="https://gordicaleksa.medium.com/eli5-flash-attention-5c44017022ad"><strong>Aleksa's ELI5 Flash Attention</strong></a>
|
2744 |
<p>Easy explanation of Flash Attention</p>
|
2745 |
+
</div>
|
|
|
2746 |
<div>
|
2747 |
<a href="https://github.com/tunib-ai/large-scale-lm-tutorials"><strong>TunibAI's 3D parallelism tutorial</strong></a>
|
2748 |
<p>Large-scale language modeling tutorials with PyTorch.</p>
|
|
|
3253 |
|
3254 |
</body>
|
3255 |
|
3256 |
+
</html>
|
src/index.html
CHANGED
@@ -208,7 +208,6 @@
|
|
208 |
|
209 |
<!-- <iframe id="plotFrame" src="assets/data/benchmarks/benchmarks_interactive.html" scrolling="no" frameborder="0" height="840" width="720"></iframe> -->
|
210 |
<div id="fragment-benchmarks_interactive"></div>
|
211 |
-
<!-- {{{benchmarks-interactive-html}}} -->
|
212 |
|
213 |
<p>As you can see, there’s a lot of ground to be covered. Before getting into the trenches of distributed training let’s take a quick high level look on the challenges we'll cover in the book.</p>
|
214 |
|
@@ -339,8 +338,8 @@
|
|
339 |
<div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
|
340 |
<script src="../assets/images/first_steps_memory_profile.js"></script>
|
341 |
-->
|
342 |
-
{{!-- <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe> --}}
|
343 |
-
|
344 |
|
345 |
<p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
|
346 |
|
@@ -461,14 +460,14 @@
|
|
461 |
|
462 |
<p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
|
463 |
|
464 |
-
<
|
465 |
-
<script>
|
466 |
window.addEventListener('load', function() {
|
467 |
const frame = document.getElementById('plotFrame3');
|
468 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
469 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
470 |
});
|
471 |
-
</script>
|
472 |
|
473 |
<p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
|
474 |
|
@@ -496,7 +495,7 @@
|
|
496 |
|
497 |
<p>Let’s see how drastically recomputation strategies can in practice reduce the memory footprint and how selective recomputation strikes a nice balance between memory saving and recomputation cost:</p>
|
498 |
|
499 |
-
|
500 |
|
501 |
<p>Another trend that's clearly visible here is how the activations for long sequences play a bigger role for smaller models, so the effect of recomputation becomes even more noticeable.</p>
|
502 |
|
@@ -735,14 +734,14 @@
|
|
735 |
<p>Lets see this happening in practice with some benchmark:</p>
|
736 |
|
737 |
<!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
|
738 |
-
<
|
739 |
-
<script>
|
740 |
window.addEventListener('load', function() {
|
741 |
const frame = document.getElementById('plotFrame4');
|
742 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
743 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
744 |
});
|
745 |
-
</script>
|
746 |
|
747 |
<p>We see that above some limit, our throughput starts to drop quite significantly while the memory usage per GPU stays constant and is not affected by adding more DP ranks.</p>
|
748 |
|
@@ -751,14 +750,15 @@
|
|
751 |
<p>The keen reader has already probably noted however that this assumes that we can fit at least one input sample forward pass (<em>mbs=1</em>) into our GPU memory. This is not always the case! As we can see, larger models don’t fit into a single GPU, even with activation recomputation activated: </p>
|
752 |
<aside>Tip: you can quickly eyeball the minimal memory required for your model’s parameters by multiplying by 2 e.g. 70B → 140GB (=133GiB)</aside>
|
753 |
|
754 |
-
<iframe class="l-body-outset" id="plotFrame5" src="assets/data/benchmarks/dp_ourjourney_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
755 |
-
<
|
|
|
756 |
window.addEventListener('load', function() {
|
757 |
const frame = document.getElementById('plotFrame5');
|
758 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
759 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
760 |
});
|
761 |
-
</script>
|
762 |
<!-- <p><img alt="dp_ourjourney_memoryusage.svg" src="/assets/images/dp_ourjourney_memoryusage.svg" /></p> -->
|
763 |
|
764 |
|
@@ -908,14 +908,15 @@
|
|
908 |
|
909 |
<p>However, there is a limit here, DP only works if a layer of the model fits in a single GPU and ZeRO can only partition the parameters, gradients, and optimizer states, but not the activation memory! We recall from <a target="_self" href="#memory_usage_in_transformers">the activation memory discussion</a> that this part of the memory scales with sequence length and batch size. Naturally we could just limit those, but in practice we don’t want to be limited by hardware to train with only a short sequence length. </p>
|
910 |
|
911 |
-
<iframe class="l-body-outset" id="plotFrame6" src="assets/data/benchmarks/zero3_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
912 |
-
<
|
|
|
913 |
window.addEventListener('load', function() {
|
914 |
const frame = document.getElementById('plotFrame6');
|
915 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
916 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
917 |
});
|
918 |
-
</script>
|
919 |
<!-- <p><img alt="zero3_memoryusage.svg" src="/assets/images/zero3_memoryusage.svg" /></p> -->
|
920 |
|
921 |
<p>To overcome these issues, it's time to explore a new, orthogonal axis of parallelism - Tensor Parallelism (TP). Unlike ZeRO3 which relies on heavy parameter communication, TP proposes to shard parameters, gradients, optimizer states AND activations across devices without requiring any communication of model parameters between GPUs.</p>
|
@@ -1014,14 +1015,15 @@
|
|
1014 |
|
1015 |
<p> Let's take a better look at the trade-off as we scale the TP degree:</p>
|
1016 |
|
1017 |
-
<iframe class="l-body-outset" id="plotFrame13" src="assets/data/benchmarks/tp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1018 |
-
<
|
|
|
1019 |
window.addEventListener('load', function() {
|
1020 |
const frame = document.getElementById('plotFrame13');
|
1021 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1022 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1023 |
});
|
1024 |
-
</script>
|
1025 |
<!--
|
1026 |
<p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p> -->
|
1027 |
|
@@ -1031,14 +1033,15 @@
|
|
1031 |
|
1032 |
<p>This being said, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
|
1033 |
|
1034 |
-
<iframe class="l-body-outset" id="plotFrame7" src="assets/data/benchmarks/tp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1035 |
-
<
|
|
|
1036 |
window.addEventListener('load', function() {
|
1037 |
const frame = document.getElementById('plotFrame7');
|
1038 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1039 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1040 |
-
});
|
1041 |
-
</script>
|
1042 |
<!-- <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p> -->
|
1043 |
|
1044 |
<p>Increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU to the point where we can start fitting a large model on a single node of 8 GPUs. </p>
|
@@ -1195,14 +1198,15 @@
|
|
1195 |
|
1196 |
<p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
|
1197 |
|
1198 |
-
<iframe class="l-body-outset" id="plotFrame8" src="assets/data/benchmarks/tp_sp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1199 |
-
<
|
|
|
1200 |
window.addEventListener('load', function() {
|
1201 |
const frame = document.getElementById('plotFrame8');
|
1202 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1203 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1204 |
-
});
|
1205 |
-
</script>
|
1206 |
<!-- <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p> -->
|
1207 |
|
1208 |
<p>As we can see, we've again strongly reduced the maximum memory usage per GPU, allowing us to fit sequence lengths of 16k tokens with TP/SP=16, an improvement over the vanilla TP case! (TP=16 is still a bit large as we've seen in the previous section, but we'll see how we can improve this in the next section).</p>
|
@@ -1217,14 +1221,15 @@
|
|
1217 |
|
1218 |
<p>We can benchmark how this communication overhead becomes increasingly problematic as we scale up tensor parallelism. Let’s measure the throughput and memory utilization as we scale TP with SP for a 3B model with 4096 seqlen:</p>
|
1219 |
|
1220 |
-
<iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1221 |
-
<
|
|
|
1222 |
window.addEventListener('load', function() {
|
1223 |
const frame = document.getElementById('plotFrame2');
|
1224 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1225 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1226 |
-
});
|
1227 |
-
</script>
|
1228 |
|
1229 |
<!-- <p><img alt="tp_sp_scaling.svg" src="/assets/images/tp_sp_scaling.svg" /></p> -->
|
1230 |
<p>Here again, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees enable processing of significantly larger batch sizes by reducing the activation memory, they also reduce per-GPU throughput, in particular above a threshold corresponding to the number of GPUs per node.</p>
|
@@ -1256,14 +1261,15 @@
|
|
1256 |
|
1257 |
<p>Moreover, even if we use full recomputation of the activations (which comes at a heavy compute overhead of ~30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length. Let's take a look and see how Context Parallelism can help us:</p>
|
1258 |
|
1259 |
-
<iframe class="l-body-outset" id="plotFrame9" src="assets/data/benchmarks/cp_8Bmemoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1260 |
-
<
|
|
|
1261 |
window.addEventListener('load', function() {
|
1262 |
const frame = document.getElementById('plotFrame9');
|
1263 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1264 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1265 |
});
|
1266 |
-
</script>
|
1267 |
|
1268 |
<!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
|
1269 |
|
@@ -1357,14 +1363,15 @@
|
|
1357 |
|
1358 |
<p>In the <a target="_self" href="#tensor-parallelism">Tensor Parallelism</a> section we saw that trying to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) hits a lower-bandwidth network called “inter-node connection” which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we benchmark it on our cluster across several nodes (each node has 8 GPUs):</p>
|
1359 |
|
1360 |
-
<iframe class="l-body-outset" id="plotFrame11" src="assets/data/benchmarks/pp_comm_bandwidth.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1361 |
-
<
|
|
|
1362 |
window.addEventListener('load', function() {
|
1363 |
const frame = document.getElementById('plotFrame11');
|
1364 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1365 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1366 |
});
|
1367 |
-
</script>
|
1368 |
|
1369 |
<!-- <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p> -->
|
1370 |
<p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
|
@@ -1375,14 +1382,15 @@
|
|
1375 |
|
1376 |
<aside>This technique may remind you of our discussion on <a target="_self" href="#zero-redundancy-optimizer">ZeRO-3</a> where we split the model parameters across GPUs. We compare both techniques in detail later in the <a target="_self" href="#5d_parallelism_in_a_nutshell">5D parallelism in a nutshell</a> section.</aside>
|
1377 |
|
1378 |
-
<iframe class="l-body" id="plotFrame12" src="assets/data/benchmarks/pp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1379 |
-
<
|
|
|
1380 |
window.addEventListener('load', function() {
|
1381 |
const frame = document.getElementById('plotFrame12');
|
1382 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1383 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1384 |
});
|
1385 |
-
</script>
|
1386 |
<!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
|
1387 |
|
1388 |
<p>Looking at the figure above, we notice something interesting: while the model parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers will be sent to the next GPU to continue the forward pass.</p>
|
@@ -1508,14 +1516,15 @@
|
|
1508 |
|
1509 |
<p>So we can now decrease the bubble by adding microbatches and interleaved stages, but note that quantitatively, the amount of communication also increases by <d-math>v</d-math> so it’s a trade off. In the following plot you can see several configurations for a PP setup with <d-math>p=8</d-math>, where the special case of <d-math>m=1, v=1</d-math> corresponds to naive pipeline parallelism and the configurations with <d-math>v=1</d-math> are AFAB or 1F1B setups and <d-math>v \neq 1</d-math> are interleaved configurations.</p>
|
1510 |
|
1511 |
-
<iframe class="l-body" id="plotFrame23" src="assets/data/benchmarks/pp_bubblesize.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
1512 |
-
<
|
|
|
1513 |
window.addEventListener('load', function() {
|
1514 |
const frame = document.getElementById('plotFrame23');
|
1515 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1516 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1517 |
});
|
1518 |
-
</script>
|
1519 |
<!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
|
1520 |
|
1521 |
|
@@ -2313,6 +2322,8 @@
|
|
2313 |
<p>We know that instability increases as learning rates rise for a fixed model size<d-cite bibtex-key="wortsman2023smallscaleproxieslargescaletransformer"></d-cite>, making FP8 pretraining particularly tricky.</p>
|
2314 |
|
2315 |
<iframe class="l-body-outset" id="plotFP8Loss" src="/assets/data/fp8/fp8_training_loss_curves.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
|
|
|
|
|
2316 |
|
2317 |
<p>The first, successful, very large scale training with FP8 mixed precision was publicly reported on DeepSeek-V3. The authors carefully analyzed each operation of the forward pass (Fprop) as well as the activation (Dgrad) and weight (Wgrad) backward pass. Similar to BF16 mixed precision training, some aggregation and master weights are kept in higher precision while the operations themselves are performed in FP8. </p>
|
2318 |
|
@@ -2731,8 +2742,7 @@
|
|
2731 |
<div>
|
2732 |
<a href="https://gordicaleksa.medium.com/eli5-flash-attention-5c44017022ad"><strong>Aleksa's ELI5 Flash Attention</strong></a>
|
2733 |
<p>Easy explanation of Flash Attention</p>
|
2734 |
-
</div>
|
2735 |
-
|
2736 |
<div>
|
2737 |
<a href="https://github.com/tunib-ai/large-scale-lm-tutorials"><strong>TunibAI's 3D parallelism tutorial</strong></a>
|
2738 |
<p>Large-scale language modeling tutorials with PyTorch.</p>
|
@@ -3243,4 +3253,4 @@
|
|
3243 |
|
3244 |
</body>
|
3245 |
|
3246 |
-
</html>
|
|
|
208 |
|
209 |
<!-- <iframe id="plotFrame" src="assets/data/benchmarks/benchmarks_interactive.html" scrolling="no" frameborder="0" height="840" width="720"></iframe> -->
|
210 |
<div id="fragment-benchmarks_interactive"></div>
|
|
|
211 |
|
212 |
<p>As you can see, there’s a lot of ground to be covered. Before getting into the trenches of distributed training let’s take a quick high level look on the challenges we'll cover in the book.</p>
|
213 |
|
|
|
338 |
<div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
|
339 |
<script src="../assets/images/first_steps_memory_profile.js"></script>
|
340 |
-->
|
341 |
+
<!-- {{!-- <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe> --}} -->
|
342 |
+
<div id="fragment-memory-profile"></div>
|
343 |
|
344 |
<p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
|
345 |
|
|
|
460 |
|
461 |
<p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
|
462 |
|
463 |
+
<div class="l-body-outset" id="fragment-memusage_activations"></div>
|
464 |
+
<!-- <script>
|
465 |
window.addEventListener('load', function() {
|
466 |
const frame = document.getElementById('plotFrame3');
|
467 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
468 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
469 |
});
|
470 |
+
</script> -->
|
471 |
|
472 |
<p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
|
473 |
|
|
|
495 |
|
496 |
<p>Let’s see how drastically recomputation strategies can in practice reduce the memory footprint and how selective recomputation strikes a nice balance between memory saving and recomputation cost:</p>
|
497 |
|
498 |
+
<div id="fragment-memory-recomputation"></div>
|
499 |
|
500 |
<p>Another trend that's clearly visible here is how the activations for long sequences play a bigger role for smaller models, so the effect of recomputation becomes even more noticeable.</p>
|
501 |
|
|
|
734 |
<p>Let's see this happening in practice with some benchmarks:</p>
|
735 |
|
736 |
<!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
|
737 |
+
<div class="l-body-outset" id="fragment-dp_scaling"></div>
|
738 |
+
<!-- <script>
|
739 |
window.addEventListener('load', function() {
|
740 |
const frame = document.getElementById('plotFrame4');
|
741 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
742 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
743 |
});
|
744 |
+
</script> -->
|
745 |
|
746 |
<p>We see that above some limit, our throughput starts to drop quite significantly while the memory usage per GPU stays constant and is not affected by adding more DP ranks.</p>
|
747 |
|
|
|
750 |
<p>The keen reader has already probably noted however that this assumes that we can fit at least one input sample forward pass (<em>mbs=1</em>) into our GPU memory. This is not always the case! As we can see, larger models don’t fit into a single GPU, even with activation recomputation activated: </p>
|
751 |
<aside>Tip: you can quickly eyeball the minimal memory required for your model’s parameters by multiplying by 2 e.g. 70B → 140GB (=133GiB)</aside>
|
752 |
|
753 |
+
<!-- <iframe class="l-body-outset" id="plotFrame5" src="assets/data/benchmarks/dp_ourjourney_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
754 |
+
<div class="l-body-outset" id="fragment-dp_ourjourney_memoryusage"></div>
|
755 |
+
<!-- <script>
|
756 |
window.addEventListener('load', function() {
|
757 |
const frame = document.getElementById('plotFrame5');
|
758 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
759 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
760 |
});
|
761 |
+
</script> -->
|
762 |
<!-- <p><img alt="dp_ourjourney_memoryusage.svg" src="/assets/images/dp_ourjourney_memoryusage.svg" /></p> -->
|
763 |
|
764 |
|
|
|
908 |
|
909 |
<p>However, there is a limit here, DP only works if a layer of the model fits in a single GPU and ZeRO can only partition the parameters, gradients, and optimizer states, but not the activation memory! We recall from <a target="_self" href="#memory_usage_in_transformers">the activation memory discussion</a> that this part of the memory scales with sequence length and batch size. Naturally we could just limit those, but in practice we don’t want to be limited by hardware to train with only a short sequence length. </p>
|
910 |
|
911 |
+
<!-- <iframe class="l-body-outset" id="plotFrame6" src="assets/data/benchmarks/zero3_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
912 |
+
<div class="l-body-outset" id="fragment-zero3_memoryusage"></div>
|
913 |
+
<!-- <script>
|
914 |
window.addEventListener('load', function() {
|
915 |
const frame = document.getElementById('plotFrame6');
|
916 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
917 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
918 |
});
|
919 |
+
</script> -->
|
920 |
<!-- <p><img alt="zero3_memoryusage.svg" src="/assets/images/zero3_memoryusage.svg" /></p> -->
|
921 |
|
922 |
<p>To overcome these issues, it's time to explore a new, orthogonal axis of parallelism - Tensor Parallelism (TP). Unlike ZeRO3 which relies on heavy parameter communication, TP proposes to shard parameters, gradients, optimizer states AND activations across devices without requiring any communication of model parameters between GPUs.</p>
|
|
|
1015 |
|
1016 |
<p> Let's take a better look at the trade-off as we scale the TP degree:</p>
|
1017 |
|
1018 |
+
<!-- <iframe class="l-body-outset" id="plotFrame13" src="assets/data/benchmarks/tp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1019 |
+
<div class="l-body-outset" id="fragment-tp_scaling"></div>
|
1020 |
+
<!-- <script>
|
1021 |
window.addEventListener('load', function() {
|
1022 |
const frame = document.getElementById('plotFrame13');
|
1023 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1024 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1025 |
});
|
1026 |
+
</script> -->
|
1027 |
<!--
|
1028 |
<p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/tp_scaling.svg" /></p> -->
|
1029 |
|
|
|
1033 |
|
1034 |
<p>This being said, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>
|
1035 |
|
1036 |
+
<!-- <iframe class="l-body-outset" id="plotFrame7" src="assets/data/benchmarks/tp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1037 |
+
<div class="l-body-outset" id="fragment-tp_memoryusage"></div>
|
1038 |
+
<!-- <script>
|
1039 |
window.addEventListener('load', function() {
|
1040 |
const frame = document.getElementById('plotFrame7');
|
1041 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1042 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1043 |
+
}); -->
|
1044 |
+
<!-- </script> -->
|
1045 |
<!-- <p><img alt="tp_memoryusage.svg" src="/assets/images/tp_memoryusage.svg" /></p> -->
|
1046 |
|
1047 |
<p>Increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU to the point where we can start fitting a large model on a single node of 8 GPUs. </p>
|
|
|
1198 |
|
1199 |
<p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>
|
1200 |
|
1201 |
+
<!-- <iframe class="l-body-outset" id="plotFrame8" src="assets/data/benchmarks/tp_sp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1202 |
+
<div class="l-body-outset" id="fragment-tp_sp_memoryusage"></div>
|
1203 |
+
<!-- <script>
|
1204 |
window.addEventListener('load', function() {
|
1205 |
const frame = document.getElementById('plotFrame8');
|
1206 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1207 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1208 |
+
}); -->
|
1209 |
+
<!-- </script> -->
|
1210 |
<!-- <p><img alt="tp_sp_memoryusage.svg" src="/assets/images/tp_sp_memoryusage.svg" /></p> -->
|
1211 |
|
1212 |
<p>As we can see, we've again strongly reduced the maximum memory usage per GPU, allowing us to fit sequence lengths of 16k tokens with TP/SP=16, an improvement over the vanilla TP case! (TP=16 is still a bit large as we've seen in the previous section, but we'll see how we can improve this in the next section).</p>
|
|
|
1221 |
|
1222 |
<p>We can benchmark how this communication overhead becomes increasingly problematic as we scale up tensor parallelism. Let’s measure the throughput and memory utilization as we scale TP with SP for a 3B model with 4096 seqlen:</p>
|
1223 |
|
1224 |
+
<!-- <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/tp_sp_scaling.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1225 |
+
<div class="l-body-outset" id="fragment-tp_sp_scaling"></div>
|
1226 |
+
<!-- <script>
|
1227 |
window.addEventListener('load', function() {
|
1228 |
const frame = document.getElementById('plotFrame2');
|
1229 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1230 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1231 |
+
}); -->
|
1232 |
+
<!-- </script> -->
|
1233 |
|
1234 |
<!-- <p><img alt="tp_sp_scaling.svg" src="/assets/images/tp_sp_scaling.svg" /></p> -->
|
1235 |
<p>Here again, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees enable processing of significantly larger batch sizes by reducing the activation memory, they also reduce per-GPU throughput, in particular above a threshold corresponding to the number of GPUs per node.</p>
|
|
|
1261 |
|
1262 |
<p>Moreover, even if we use full recomputation of the activations (which comes at a heavy compute overhead of ~30%), we still need to hold in memory some activations at the layer boundaries which scale linearly with sequence length. Let's take a look and see how Context Parallelism can help us:</p>
|
1263 |
|
1264 |
+
<!-- <iframe class="l-body-outset" id="plotFrame9" src="assets/data/benchmarks/cp_8Bmemoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1265 |
+
<div class="l-body-outset" id="fragment-cp_8Bmemoryusage"></div>
|
1266 |
+
<!-- <script>
|
1267 |
window.addEventListener('load', function() {
|
1268 |
const frame = document.getElementById('plotFrame9');
|
1269 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1270 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1271 |
});
|
1272 |
+
</script> -->
|
1273 |
|
1274 |
<!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
|
1275 |
|
|
|
1363 |
|
1364 |
<p>In the <a target="_self" href="#tensor-parallelism">Tensor Parallelism</a> section we saw that trying to scale Tensor parallelism past the number of GPUs per single node (typically 4 or 8) hits a lower-bandwidth network called “inter-node connection” which can quite strongly impair our performance. We can see this clearly on e.g. the all-reduce operation when we benchmark it on our cluster across several nodes (each node has 8 GPUs):</p>
|
1365 |
|
1366 |
+
<!-- <iframe class="l-body-outset" id="plotFrame11" src="assets/data/benchmarks/pp_comm_bandwidth.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1367 |
+
<div class="l-body-outset" id="fragment-pp_comm_bandwidth"></div>
|
1368 |
+
<!-- <script>
|
1369 |
window.addEventListener('load', function() {
|
1370 |
const frame = document.getElementById('plotFrame11');
|
1371 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1372 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1373 |
});
|
1374 |
+
</script> -->
|
1375 |
|
1376 |
<!-- <p><img alt="pp_comm_bandwidth.svg" src="/assets/images/pp_comm_bandwidth.svg" /></p> -->
|
1377 |
<p>Inter-node communication bandwidth measurements across different node counts, showing median (lines) and 5th-95th percentile ranges (shaded areas) for AllReduce, AllGather and ReduceScatter operations.</p>
|
|
|
1382 |
|
1383 |
<aside>This technique may remind you of our discussion on <a target="_self" href="#zero-redundancy-optimizer">ZeRO-3</a> where we split the model parameters across GPUs. We compare both techniques in detail later in the <a target="_self" href="#5d_parallelism_in_a_nutshell">5D parallelism in a nutshell</a> section.</aside>
|
1384 |
|
1385 |
+
<!-- <iframe class="l-body" id="plotFrame12" src="assets/data/benchmarks/pp_memoryusage.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1386 |
+
<div class="l-body" id="fragment-pp_memoryusage"></div>
|
1387 |
+
<!-- <script>
|
1388 |
window.addEventListener('load', function() {
|
1389 |
const frame = document.getElementById('plotFrame12');
|
1390 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1391 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1392 |
});
|
1393 |
+
</script> -->
|
1394 |
<!-- <p><img alt="pp_memoryusage.svg" src="/assets/images/pp_memoryusage.svg" /></p> -->
|
1395 |
|
1396 |
<p>Looking at the figure above, we notice something interesting: while the model parameters are nicely split across GPUs, the activation memory remains the same on each GPU! This is because each GPU still needs to process the full batch of data, just with different layers. The activations from one GPU's layers will be sent to the next GPU to continue the forward pass.</p>
|
|
|
1516 |
|
1517 |
<p>So we can now decrease the bubble by adding microbatches and interleaved stages, but note that quantitatively, the amount of communication also increases by <d-math>v</d-math> so it’s a trade off. In the following plot you can see several configurations for a PP setup with <d-math>p=8</d-math>, where the special case of <d-math>m=1, v=1</d-math> corresponds to naive pipeline parallelism and the configurations with <d-math>v=1</d-math> are AFAB or 1F1B setups and <d-math>v \neq 1</d-math> are interleaved configurations.</p>
|
1518 |
|
1519 |
+
<!-- <iframe class="l-body" id="plotFrame23" src="assets/data/benchmarks/pp_bubblesize.html" width="90%" scrolling="no" frameborder="0"></iframe> -->
|
1520 |
+
<div class="l-body" id="fragment-pp_bubblesize"></div>
|
1521 |
+
<!-- <script>
|
1522 |
window.addEventListener('load', function() {
|
1523 |
const frame = document.getElementById('plotFrame23');
|
1524 |
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
1525 |
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
1526 |
});
|
1527 |
+
</script> -->
|
1528 |
<!-- <p><img alt="pp_bubblesize.png" src="/assets/images/pp_bubblesize.png" /></p> -->
|
1529 |
|
1530 |
|
|
|
2322 |
<p>We know that instability increases as learning rates rise for a fixed model size<d-cite bibtex-key="wortsman2023smallscaleproxieslargescaletransformer"></d-cite>, making FP8 pretraining particularly tricky.</p>
|
2323 |
|
2324 |
<iframe class="l-body-outset" id="plotFP8Loss" src="/assets/data/fp8/fp8_training_loss_curves.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
|
2325 |
+
<!-- Hynek uncomment this once it's added to -->
|
2326 |
+
<!-- <div class="l-body-outset" id="fragment-fp8_training_loss_curves"></div> -->
|
2327 |
|
2328 |
<p>The first, successful, very large scale training with FP8 mixed precision was publicly reported on DeepSeek-V3. The authors carefully analyzed each operation of the forward pass (Fprop) as well as the activation (Dgrad) and weight (Wgrad) backward pass. Similar to BF16 mixed precision training, some aggregation and master weights are kept in higher precision while the operations themselves are performed in FP8. </p>
|
2329 |
|
|
|
2742 |
<div>
|
2743 |
<a href="https://gordicaleksa.medium.com/eli5-flash-attention-5c44017022ad"><strong>Aleksa's ELI5 Flash Attention</strong></a>
|
2744 |
<p>Easy explanation of Flash Attention</p>
|
2745 |
+
</div>
|
|
|
2746 |
<div>
|
2747 |
<a href="https://github.com/tunib-ai/large-scale-lm-tutorials"><strong>TunibAI's 3D parallelism tutorial</strong></a>
|
2748 |
<p>Large-scale language modeling tutorials with PyTorch.</p>
|
|
|
3253 |
|
3254 |
</body>
|
3255 |
|
3256 |
+
</html>
|
webpack.config.js
CHANGED
@@ -130,6 +130,14 @@ module.exports = {
|
|
130 |
},
|
131 |
{
|
132 |
implementation: ImageMinimizerPlugin.svgoMinify,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
}
|
134 |
]
|
135 |
}),
|
|
|
130 |
},
|
131 |
{
|
132 |
implementation: ImageMinimizerPlugin.svgoMinify,
|
133 |
+
options: {
|
134 |
+
encodeOptions: {
|
135 |
+
multipass: true,
|
136 |
+
plugins: [
|
137 |
+
'preset-default',
|
138 |
+
]
|
139 |
+
}
|
140 |
+
}
|
141 |
}
|
142 |
]
|
143 |
}),
|