"use strict";(self.webpackChunkmy_website=self.webpackChunkmy_website||[]).push([[268],{7227:(e,n,t)=>{t.d(n,{A:()=>r});var l=t(96540),o=t(20053);const a={tabItem:"tabItem_Ymn6"};function r(e){let{children:n,hidden:t,className:r}=e;return l.createElement("div",{role:"tabpanel",className:(0,o.A)(a.tabItem,r),hidden:t},n)}},49489:(e,n,t)=>{t.d(n,{A:()=>A});var l=t(58168),o=t(96540),a=t(20053),r=t(24245),s=t(56347),i=t(62814),p=t(45167),m=t(81269);function g(e){return function(e){return o.Children.map(e,(e=>{if(!e||(0,o.isValidElement)(e)&&function(e){const{props:n}=e;return!!n&&"object"==typeof n&&"value"in n}(e))return e;throw new Error(`Docusaurus error: Bad <Tabs> child <${"string"==typeof e.type?e.type:e.type.name}>: all children of the <Tabs> component should be <TabItem>, and every <TabItem> should have a unique "value" prop.`)}))?.filter(Boolean)??[]}(e).map((e=>{let{props:{value:n,label:t,attributes:l,default:o}}=e;return{value:n,label:t,attributes:l,default:o}}))}function u(e){const{values:n,children:t}=e;return(0,o.useMemo)((()=>{const e=n??g(t);return function(e){const n=(0,p.X)(e,((e,n)=>e.value===n.value));if(n.length>0)throw new Error(`Docusaurus error: Duplicate values "${n.map((e=>e.value)).join(", ")}" found in <Tabs>. Every value needs to be unique.`)}(e),e}),[n,t])}function d(e){let{value:n,tabValues:t}=e;return t.some((e=>e.value===n))}function c(e){let{queryString:n=!1,groupId:t}=e;const l=(0,s.W6)(),a=function(e){let{queryString:n=!1,groupId:t}=e;if("string"==typeof n)return n;if(!1===n)return null;if(!0===n&&!t)throw new Error('Docusaurus error: The <Tabs> component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return t??null}({queryString:n,groupId:t});return[(0,i.aZ)(a),(0,o.useCallback)((e=>{if(!a)return;const n=new URLSearchParams(l.location.search);n.set(a,e),l.replace({...l.location,search:n.toString()})}),[a,l])]}function y(e){const{defaultValue:n,queryString:t=!1,groupId:l}=e,a=u(e),[r,s]=(0,o.useState)((()=>function(e){let{defaultValue:n,tabValues:t}=e;if(0===t.length)throw new Error("Docusaurus error: the <Tabs> component requires at least one <TabItem> children component");if(n){if(!d({value:n,tabValues:t}))throw new Error(`Docusaurus error: The <Tabs> has a defaultValue "${n}" but none of its children has the corresponding value. Available values are: ${t.map((e=>e.value)).join(", ")}. If you intend to show no default tab, use defaultValue={null} instead.`);return n}const l=t.find((e=>e.default))??t[0];if(!l)throw new Error("Unexpected error: 0 tabValues");return l.value}({defaultValue:n,tabValues:a}))),[i,p]=c({queryString:t,groupId:l}),[g,y]=function(e){let{groupId:n}=e;const t=function(e){return e?`docusaurus.tab.${e}`:null}(n),[l,a]=(0,m.Dv)(t);return[l,(0,o.useCallback)((e=>{t&&a.set(e)}),[t,a])]}({groupId:l}),_=(()=>{const e=i??g;return d({value:e,tabValues:a})?e:null})();(0,o.useLayoutEffect)((()=>{_&&s(_)}),[_]);return{selectedValue:r,selectValue:(0,o.useCallback)((e=>{if(!d({value:e,tabValues:a}))throw new Error(`Can't select invalid tab value=${e}`);s(e),p(e),y(e)}),[p,y,a]),tabValues:a}}var _=t(11062);const h={tabList:"tabList__CuJ",tabItem:"tabItem_LNqP"};function b(e){let{className:n,block:t,selectedValue:s,selectValue:i,tabValues:p}=e;const m=[],{blockElementScrollPositionUntilNextRender:g}=(0,r.a_)(),u=e=>{const n=e.currentTarget,t=m.indexOf(n),l=p[t].value;l!==s&&(g(n),i(l))},d=e=>{let n=null;switch(e.key){case"Enter":u(e);break;case"ArrowRight":{const t=m.indexOf(e.currentTarget)+1;n=m[t]??m[0];break}case"ArrowLeft":{const t=m.indexOf(e.currentTarget)-1;n=m[t]??m[m.length-1];break}}n?.focus()};return o.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,a.A)("tabs",{"tabs--block":t},n)},p.map((e=>{let{value:n,label:t,attributes:r}=e;return o.createElement("li",(0,l.A)({role:"tab",tabIndex:s===n?0:-1,"aria-selected":s===n,key:n,ref:e=>m.push(e),onKeyDown:d,onClick:u},r,{className:(0,a.A)("tabs__item",h.tabItem,r?.className,{"tabs__item--active":s===n})}),t??n)})))}function f(e){let{lazy:n,children:t,selectedValue:l}=e;const a=(Array.isArray(t)?t:[t]).filter(Boolean);if(n){const e=a.find((e=>e.props.value===l));return e?(0,o.cloneElement)(e,{className:"margin-top--md"}):null}return o.createElement("div",{className:"margin-top--md"},a.map(((e,n)=>(0,o.cloneElement)(e,{key:n,hidden:e.props.value!==l}))))}function v(e){const n=y(e);return o.createElement("div",{className:(0,a.A)("tabs-container",h.tabList)},o.createElement(b,(0,l.A)({},e,n)),o.createElement(f,(0,l.A)({},e,n)))}function A(e){const n=(0,_.A)();return o.createElement(v,(0,l.A)({key:String(n)},e))}},1455:(e,n,t)=>{t.r(n),t.d(n,{assets:()=>m,contentTitle:()=>i,default:()=>c,frontMatter:()=>s,metadata:()=>p,toc:()=>g});var l=t(58168),o=(t(96540),t(15680)),a=(t(13012),t(49489)),r=t(7227);const s={},i="Router - Load Balancing",p={unversionedId:"routing",id:"routing",title:"Router - Load Balancing",description:"LiteLLM manages:",source:"@site/docs/routing.md",sourceDirName:".",slug:"/routing",permalink:"/docs/routing",draft:!1,tags:[],version:"current",frontMatter:{},sidebar:"tutorialSidebar",previous:{title:"Routing, Loadbalancing & Fallbacks",permalink:"/docs/routing-load-balancing"},next:{title:"[BETA] Request Prioritization",permalink:"/docs/scheduler"}},m={},g=[{value:"Load Balancing",id:"load-balancing",level:2},{value:"Quick Start",id:"quick-start",level:3},{value:"Available Endpoints",id:"available-endpoints",level:3},{value:"Advanced - Routing Strategies \u2b50\ufe0f",id:"advanced---routing-strategies-\ufe0f",level:2},{value:"Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based",id:"routing-strategies---weighted-pick-rate-limit-aware-least-busy-latency-based-cost-based",level:4},{value:"Set Time Window",id:"set-time-window",level:4},{value:"Set Lowest Latency Buffer",id:"set-lowest-latency-buffer",level:4},{value:"<strong>LiteLLM Proxy Config.yaml</strong>",id:"litellm-proxy-configyaml",level:5},{value:"<strong>Python SDK</strong>",id:"python-sdk",level:5},{value:"<strong>LiteLLM Proxy Config.yaml</strong>",id:"litellm-proxy-configyaml-1",level:5},{value:"<strong>Python SDK</strong>",id:"python-sdk-1",level:5},{value:"Using Custom Input/Output pricing",id:"using-custom-inputoutput-pricing",level:4},{value:"Basic Reliability",id:"basic-reliability",level:2},{value:"Weighted Deployments",id:"weighted-deployments",level:3},{value:"Max Parallel Requests (ASYNC)",id:"max-parallel-requests-async",level:3},{value:"Cooldowns",id:"cooldowns",level:3},{value:"<strong>Disable cooldowns</strong>",id:"disable-cooldowns",level:4},{value:"Retries",id:"retries",level:3},{value:"Advanced: Custom Retries, Cooldowns based on Error Type",id:"advanced-custom-retries-cooldowns-based-on-error-type",level:3},{value:"Caching",id:"caching",level:3},{value:"Pre-Call Checks (Context Window, EU-Regions)",id:"pre-call-checks-context-window-eu-regions",level:2},{value:"Caching across model groups",id:"caching-across-model-groups",level:2},{value:"Alerting \ud83d\udea8",id:"alerting-",level:2},{value:"Usage",id:"usage",level:4},{value:"Track cost for Azure Deployments",id:"track-cost-for-azure-deployments",level:2},{value:"Default litellm.completion/embedding params",id:"default-litellmcompletionembedding-params",level:4},{value:"Custom Callbacks - Track API Key, API Endpoint, Model Used",id:"custom-callbacks---track-api-key-api-endpoint-model-used",level:2},{value:"Usage",id:"usage-1",level:3},{value:"Deploy Router",id:"deploy-router",level:2},{value:"Debugging Router",id:"debugging-router",level:2},{value:"Basic Debugging",id:"basic-debugging",level:3},{value:"Detailed Debugging",id:"detailed-debugging",level:3},{value:"Very Detailed Debugging",id:"very-detailed-debugging",level:3},{value:"Router General Settings",id:"router-general-settings",level:2},{value:"Usage",id:"usage-2",level:3},{value:"Spec",id:"spec",level:3}],u={toc:g},d="wrapper";function c(e){let{components:n,...t}=e;return(0,o.yg)(d,(0,l.A)({},u,t,{components:n,mdxType:"MDXLayout"}),(0,o.yg)("h1",{id:"router---load-balancing"},"Router - Load Balancing"),(0,o.yg)("p",null,"LiteLLM manages:"),(0,o.yg)("ul",null,(0,o.yg)("li",{parentName:"ul"},"Load-balance across multiple deployments (e.g. Azure/OpenAI)"),(0,o.yg)("li",{parentName:"ul"},"Prioritizing important requests to ensure they don't fail (i.e. Queueing)"),(0,o.yg)("li",{parentName:"ul"},"Basic reliability logic - cooldowns, fallbacks, timeouts and retries (fixed + exponential backoff) across multiple deployments/providers.")),(0,o.yg)("p",null,"In production, litellm supports using Redis as a way to track cooldown server and usage (managing tpm/rpm limits)."),(0,o.yg)("admonition",{type:"info"},(0,o.yg)("p",{parentName:"admonition"},"If you want a server to load balance across different LLM APIs, use our ",(0,o.yg)("a",{parentName:"p",href:"/docs/proxy/load_balancing"},"LiteLLM Proxy Server"))),(0,o.yg)("h2",{id:"load-balancing"},"Load Balancing"),(0,o.yg)("p",null,"(s/o ",(0,o.yg)("a",{parentName:"p",href:"https://www.linkedin.com/in/paulpierre/"},"@paulpierre")," and ",(0,o.yg)("a",{parentName:"p",href:"https://docs.sweep.dev/blogs/openai-proxy"},"sweep proxy")," for their contributions to this implementation)\n",(0,o.yg)("a",{parentName:"p",href:"https://github.com/BerriAI/litellm/blob/main/litellm/router.py"},(0,o.yg)("strong",{parentName:"a"},"See Code"))),(0,o.yg)("h3",{id:"quick-start"},"Quick Start"),(0,o.yg)("p",null,"Loadbalance across multiple ",(0,o.yg)("a",{parentName:"p",href:"/docs/providers/azure"},"azure"),"/",(0,o.yg)("a",{parentName:"p",href:"/docs/providers/bedrock"},"bedrock"),"/",(0,o.yg)("a",{parentName:"p",href:"./providers/"},"provider")," deployments. LiteLLM will handle retrying in different regions if a call fails."),(0,o.yg)(a.A,{mdxType:"Tabs"},(0,o.yg)(r.A,{value:"sdk",label:"SDK",mdxType:"TabItem"},(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router\n\nmodel_list = [{ # list of model deployments \n    "model_name": "gpt-3.5-turbo", # model alias -> loadbalance between models with same `model_name`\n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/chatgpt-v-2", # actual model name\n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE")\n    }\n}, {\n    "model_name": "gpt-3.5-turbo", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/chatgpt-functioncalling", \n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE")\n    }\n}, {\n    "model_name": "gpt-3.5-turbo", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "gpt-3.5-turbo", \n        "api_key": os.getenv("OPENAI_API_KEY"),\n    }\n}, {\n    "model_name": "gpt-4", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/gpt-4", \n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_base": os.getenv("AZURE_API_BASE"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n    }\n}, {\n    "model_name": "gpt-4", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "gpt-4", \n        "api_key": os.getenv("OPENAI_API_KEY"),\n    }\n},\n\n]\n\nrouter = Router(model_list=model_list)\n\n# openai.ChatCompletion.create replacement\n# requests with model="gpt-3.5-turbo" will pick a deployment where model_name="gpt-3.5-turbo"\nresponse = await router.acompletion(model="gpt-3.5-turbo", \n                messages=[{"role": "user", "content": "Hey, how\'s it going?"}])\n\nprint(response)\n\n# openai.ChatCompletion.create replacement\n# requests with model="gpt-4" will pick a deployment where model_name="gpt-4"\nresponse = await router.acompletion(model="gpt-4", \n                messages=[{"role": "user", "content": "Hey, how\'s it going?"}])\n\nprint(response)\n'))),(0,o.yg)(r.A,{value:"proxy",label:"PROXY",mdxType:"TabItem"},(0,o.yg)("admonition",{type:"info"},(0,o.yg)("p",{parentName:"admonition"},"See detailed proxy loadbalancing/fallback docs ",(0,o.yg)("a",{parentName:"p",href:"/docs/proxy/reliability"},"here"))),(0,o.yg)("ol",null,(0,o.yg)("li",{parentName:"ol"},"Setup model_list with multiple deployments")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},"model_list:\n  - model_name: gpt-3.5-turbo\n    litellm_params:\n      model: azure/<your-deployment-name>\n      api_base: <your-azure-endpoint>\n      api_key: <your-azure-api-key>\n  - model_name: gpt-3.5-turbo\n    litellm_params:\n      model: azure/gpt-turbo-small-ca\n      api_base: https://my-endpoint-canada-berri992.openai.azure.com/\n      api_key: <your-azure-api-key>\n  - model_name: gpt-3.5-turbo\n    litellm_params:\n      model: azure/gpt-turbo-large\n      api_base: https://openai-france-1234.openai.azure.com/\n      api_key: <your-azure-api-key>\n")),(0,o.yg)("ol",{start:2},(0,o.yg)("li",{parentName:"ol"},"Start proxy ")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-bash"},"litellm --config /path/to/config.yaml \n")),(0,o.yg)("ol",{start:3},(0,o.yg)("li",{parentName:"ol"},"Test it! ")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-bash"},'curl -X POST \'http://0.0.0.0:4000/chat/completions\' \\\n-H \'Content-Type: application/json\' \\\n-H \'Authorization: Bearer sk-1234\' \\\n-d \'{\n  "model": "gpt-3.5-turbo",\n  "messages": [\n        {"role": "user", "content": "Hi there!"}\n    ],\n    "mock_testing_rate_limit_error": true\n}\'\n')))),(0,o.yg)("h3",{id:"available-endpoints"},"Available Endpoints"),(0,o.yg)("ul",null,(0,o.yg)("li",{parentName:"ul"},(0,o.yg)("inlineCode",{parentName:"li"},"router.completion()")," - chat completions endpoint to call 100+ LLMs"),(0,o.yg)("li",{parentName:"ul"},(0,o.yg)("inlineCode",{parentName:"li"},"router.acompletion()")," - async chat completion calls"),(0,o.yg)("li",{parentName:"ul"},(0,o.yg)("inlineCode",{parentName:"li"},"router.embedding()")," - embedding endpoint for Azure, OpenAI, Huggingface endpoints"),(0,o.yg)("li",{parentName:"ul"},(0,o.yg)("inlineCode",{parentName:"li"},"router.aembedding()")," - async embeddings calls"),(0,o.yg)("li",{parentName:"ul"},(0,o.yg)("inlineCode",{parentName:"li"},"router.text_completion()")," - completion calls in the old OpenAI ",(0,o.yg)("inlineCode",{parentName:"li"},"/v1/completions")," endpoint format"),(0,o.yg)("li",{parentName:"ul"},(0,o.yg)("inlineCode",{parentName:"li"},"router.atext_completion()")," - async text completion calls"),(0,o.yg)("li",{parentName:"ul"},(0,o.yg)("inlineCode",{parentName:"li"},"router.image_generation()")," - completion calls in OpenAI ",(0,o.yg)("inlineCode",{parentName:"li"},"/v1/images/generations")," endpoint format"),(0,o.yg)("li",{parentName:"ul"},(0,o.yg)("inlineCode",{parentName:"li"},"router.aimage_generation()")," - async image generation calls")),(0,o.yg)("h2",{id:"advanced---routing-strategies-\ufe0f"},"Advanced - Routing Strategies \u2b50\ufe0f"),(0,o.yg)("h4",{id:"routing-strategies---weighted-pick-rate-limit-aware-least-busy-latency-based-cost-based"},"Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based"),(0,o.yg)("p",null,"Router provides 4 strategies for routing your calls across multiple deployments: "),(0,o.yg)(a.A,{mdxType:"Tabs"},(0,o.yg)(r.A,{value:"usage-based-v2",label:"Rate-Limit Aware v2 (ASYNC)",mdxType:"TabItem"},(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"\ud83c\udf89 NEW")," This is an async implementation of usage-based-routing."),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Filters out deployment if tpm/rpm limit exceeded")," - If you pass in the deployment's tpm/rpm limits."),(0,o.yg)("p",null,"Routes to ",(0,o.yg)("strong",{parentName:"p"},"deployment with lowest TPM usage")," for that minute. "),(0,o.yg)("p",null,"In production, we use Redis to track usage (TPM/RPM) across multiple deployments. This implementation uses ",(0,o.yg)("strong",{parentName:"p"},"async redis calls")," (redis.incr and redis.mget)."),(0,o.yg)("p",null,"For Azure, ",(0,o.yg)("a",{parentName:"p",href:"https://stackoverflow.com/questions/77368844/what-is-the-request-per-minute-rate-limit-for-azure-openai-models-for-gpt-3-5-tu"},"you get 6 RPM per 1000 TPM")),(0,o.yg)(a.A,{mdxType:"Tabs"},(0,o.yg)(r.A,{value:"sdk",label:"sdk",mdxType:"TabItem"},(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router \n\n\nmodel_list = [{ # list of model deployments \n    "model_name": "gpt-3.5-turbo", # model alias \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/chatgpt-v-2", # actual model name\n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE")\n        "tpm": 100000,\n        "rpm": 10000,\n    }, \n}, {\n    "model_name": "gpt-3.5-turbo", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/chatgpt-functioncalling", \n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE")\n        "tpm": 100000,\n        "rpm": 1000,\n    },\n}, {\n    "model_name": "gpt-3.5-turbo", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "gpt-3.5-turbo", \n        "api_key": os.getenv("OPENAI_API_KEY"),\n        "tpm": 100000,\n        "rpm": 1000,\n    },\n}]\nrouter = Router(model_list=model_list, \n                redis_host=os.environ["REDIS_HOST"], \n                redis_password=os.environ["REDIS_PASSWORD"], \n                redis_port=os.environ["REDIS_PORT"], \n                routing_strategy="usage-based-routing-v2" # \ud83d\udc48 KEY CHANGE\n                enable_pre_call_checks=True, # enables router rate limits for concurrent calls\n                )\n\nresponse = await router.acompletion(model="gpt-3.5-turbo", \n                messages=[{"role": "user", "content": "Hey, how\'s it going?"}]\n\nprint(response)\n'))),(0,o.yg)(r.A,{value:"proxy",label:"proxy",mdxType:"TabItem"},(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"1. Set strategy in config")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},"model_list:\n    - model_name: gpt-3.5-turbo # model alias \n      litellm_params: # params for litellm completion/embedding call \n        model: azure/chatgpt-v-2 # actual model name\n        api_key: os.environ/AZURE_API_KEY\n        api_version: os.environ/AZURE_API_VERSION\n        api_base: os.environ/AZURE_API_BASE\n      tpm: 100000\n      rpm: 10000\n    - model_name: gpt-3.5-turbo \n      litellm_params: # params for litellm completion/embedding call \n        model: gpt-3.5-turbo \n        api_key: os.getenv(OPENAI_API_KEY)\n      tpm: 100000\n      rpm: 1000\n\nrouter_settings:\n  routing_strategy: usage-based-routing-v2 # \ud83d\udc48 KEY CHANGE\n  redis_host: <your-redis-host>\n  redis_password: <your-redis-password>\n  redis_port: <your-redis-port>\n  enable_pre_call_check: true\n\ngeneral_settings:\n  master_key: sk-1234\n")),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"2. Start proxy")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-bash"},"litellm --config /path/to/config.yaml\n")),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"3. Test it!")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-bash"},'curl --location \'http://localhost:4000/v1/chat/completions\' \\\n--header \'Content-Type: application/json\' \\\n--header \'Authorization: Bearer sk-1234\' \\\n--data \'{\n    "model": "gpt-3.5-turbo", \n    "messages": [{"role": "user", "content": "Hey, how\'s it going?"}]\n}\'\n'))))),(0,o.yg)(r.A,{value:"latency-based",label:"Latency-Based",mdxType:"TabItem"},(0,o.yg)("p",null,"Picks the deployment with the lowest response time."),(0,o.yg)("p",null,"It caches, and updates the response times for deployments based on when a request was sent and received from a deployment."),(0,o.yg)("p",null,(0,o.yg)("a",{parentName:"p",href:"https://github.com/BerriAI/litellm/blob/main/tests/local_testing/test_lowest_latency_routing.py"},(0,o.yg)("strong",{parentName:"a"},"How to test"))),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router \nimport asyncio\n\nmodel_list = [{ ... }]\n\n# init router\nrouter = Router(model_list=model_list,\n                routing_strategy="latency-based-routing",# \ud83d\udc48 set routing strategy\n                enable_pre_call_check=True, # enables router rate limits for concurrent calls\n                )\n\n## CALL 1+2\ntasks = []\nresponse = None\nfinal_response = None\nfor _ in range(2):\n    tasks.append(router.acompletion(model=model, messages=messages))\nresponse = await asyncio.gather(*tasks)\n\nif response is not None:\n    ## CALL 3 \n    await asyncio.sleep(1)  # let the cache update happen\n    picked_deployment = router.lowestlatency_logger.get_available_deployments(\n        model_group=model, healthy_deployments=router.healthy_deployments\n    )\n    final_response = await router.acompletion(model=model, messages=messages)\n    print(f"min deployment id: {picked_deployment}")\n    print(f"model id: {final_response._hidden_params[\'model_id\']}")\n    assert (\n        final_response._hidden_params["model_id"]\n        == picked_deployment["model_info"]["id"]\n    )\n')),(0,o.yg)("h4",{id:"set-time-window"},"Set Time Window"),(0,o.yg)("p",null,"Set time window for how far back to consider when averaging latency for a deployment. "),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"In Router")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'router = Router(..., routing_strategy_args={"ttl": 10})\n')),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"In Proxy")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},'router_settings:\n    routing_strategy_args: {"ttl": 10}\n')),(0,o.yg)("h4",{id:"set-lowest-latency-buffer"},"Set Lowest Latency Buffer"),(0,o.yg)("p",null,"Set a buffer within which deployments are candidates for making calls to. "),(0,o.yg)("p",null,"E.g. "),(0,o.yg)("p",null,"if you have 5 deployments"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"https://litellm-prod-1.openai.azure.com/: 0.07s\nhttps://litellm-prod-2.openai.azure.com/: 0.1s\nhttps://litellm-prod-3.openai.azure.com/: 0.1s\nhttps://litellm-prod-4.openai.azure.com/: 0.1s\nhttps://litellm-prod-5.openai.azure.com/: 4.66s\n")),(0,o.yg)("p",null,"to prevent initially overloading ",(0,o.yg)("inlineCode",{parentName:"p"},"prod-1"),", with all requests - we can set a buffer of 50%, to consider deployments ",(0,o.yg)("inlineCode",{parentName:"p"},"prod-2, prod-3, prod-4"),". "),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"In Router")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'router = Router(..., routing_strategy_args={"lowest_latency_buffer": 0.5})\n')),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"In Proxy")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},'router_settings:\n    routing_strategy_args: {"lowest_latency_buffer": 0.5}\n'))),(0,o.yg)(r.A,{value:"simple-shuffle",label:"(Default) Weighted Pick (Async)",mdxType:"TabItem"},(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Default")," Picks a deployment based on the provided ",(0,o.yg)("strong",{parentName:"p"},"Requests per minute (rpm) or Tokens per minute (tpm)")),(0,o.yg)("p",null,"If ",(0,o.yg)("inlineCode",{parentName:"p"},"rpm")," or ",(0,o.yg)("inlineCode",{parentName:"p"},"tpm")," is not provided, it randomly picks a deployment"),(0,o.yg)("p",null,"You can also set a ",(0,o.yg)("inlineCode",{parentName:"p"},"weight")," param, to specify which model should get picked when."),(0,o.yg)(a.A,{mdxType:"Tabs"},(0,o.yg)(r.A,{value:"rpm",label:"RPM-based shuffling",mdxType:"TabItem"},(0,o.yg)("h5",{id:"litellm-proxy-configyaml"},(0,o.yg)("strong",{parentName:"h5"},"LiteLLM Proxy Config.yaml")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},"model_list:\n    - model_name: gpt-3.5-turbo\n      litellm_params:\n        model: azure/chatgpt-v-2\n        api_key: os.environ/AZURE_API_KEY\n        api_version: os.environ/AZURE_API_VERSION\n        api_base: os.environ/AZURE_API_BASE\n        rpm: 900 \n    - model_name: gpt-3.5-turbo\n      litellm_params:\n        model: azure/chatgpt-functioncalling\n        api_key: os.environ/AZURE_API_KEY\n        api_version: os.environ/AZURE_API_VERSION\n        api_base: os.environ/AZURE_API_BASE\n        rpm: 10 \n")),(0,o.yg)("h5",{id:"python-sdk"},(0,o.yg)("strong",{parentName:"h5"},"Python SDK")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router \nimport asyncio\n\nmodel_list = [{ # list of model deployments \n    "model_name": "gpt-3.5-turbo", # model alias \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/chatgpt-v-2", # actual model name\n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE"),\n        "rpm": 900,         # requests per minute for this API\n    }\n}, {\n    "model_name": "gpt-3.5-turbo", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/chatgpt-functioncalling", \n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE"),\n        "rpm": 10,\n    }\n},]\n\n# init router\nrouter = Router(model_list=model_list, routing_strategy="simple-shuffle")\nasync def router_acompletion():\n    response = await router.acompletion(\n        model="gpt-3.5-turbo", \n        messages=[{"role": "user", "content": "Hey, how\'s it going?"}]\n    )\n    print(response)\n    return response\n\nasyncio.run(router_acompletion())\n'))),(0,o.yg)(r.A,{value:"weight",label:"Weight-based shuffling",mdxType:"TabItem"},(0,o.yg)("h5",{id:"litellm-proxy-configyaml-1"},(0,o.yg)("strong",{parentName:"h5"},"LiteLLM Proxy Config.yaml")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},"model_list:\n    - model_name: gpt-3.5-turbo\n      litellm_params:\n        model: azure/chatgpt-v-2\n        api_key: os.environ/AZURE_API_KEY\n        api_version: os.environ/AZURE_API_VERSION\n        api_base: os.environ/AZURE_API_BASE\n        weight: 9\n    - model_name: gpt-3.5-turbo\n      litellm_params:\n        model: azure/chatgpt-functioncalling\n        api_key: os.environ/AZURE_API_KEY\n        api_version: os.environ/AZURE_API_VERSION\n        api_base: os.environ/AZURE_API_BASE\n        weight: 1 \n")),(0,o.yg)("h5",{id:"python-sdk-1"},(0,o.yg)("strong",{parentName:"h5"},"Python SDK")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router \nimport asyncio\n\nmodel_list = [{\n    "model_name": "gpt-3.5-turbo", # model alias \n    "litellm_params": { \n        "model": "azure/chatgpt-v-2", # actual model name\n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE"),\n        "weight": 9, # pick this 90% of the time\n    }\n}, {\n    "model_name": "gpt-3.5-turbo", \n    "litellm_params": { \n        "model": "azure/chatgpt-functioncalling", \n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE"),\n        "weight": 1,\n    }\n}]\n\n# init router\nrouter = Router(model_list=model_list, routing_strategy="simple-shuffle")\nasync def router_acompletion():\n    response = await router.acompletion(\n        model="gpt-3.5-turbo", \n        messages=[{"role": "user", "content": "Hey, how\'s it going?"}]\n    )\n    print(response)\n    return response\n\nasyncio.run(router_acompletion())\n'))))),(0,o.yg)(r.A,{value:"usage-based",label:"Rate-Limit Aware",mdxType:"TabItem"},(0,o.yg)("p",null,"This will route to the deployment with the lowest TPM usage for that minute. "),(0,o.yg)("p",null,"In production, we use Redis to track usage (TPM/RPM) across multiple deployments. "),(0,o.yg)("p",null,"If you pass in the deployment's tpm/rpm limits, this will also check against that, and filter out any who's limits would be exceeded. "),(0,o.yg)("p",null,"For Azure, your RPM = TPM/6. "),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router \n\n\nmodel_list = [{ # list of model deployments \n    "model_name": "gpt-3.5-turbo", # model alias \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/chatgpt-v-2", # actual model name\n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE")\n    }, \n    "tpm": 100000,\n    "rpm": 10000,\n}, {\n    "model_name": "gpt-3.5-turbo", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/chatgpt-functioncalling", \n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE")\n    },\n    "tpm": 100000,\n    "rpm": 1000,\n}, {\n    "model_name": "gpt-3.5-turbo", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "gpt-3.5-turbo", \n        "api_key": os.getenv("OPENAI_API_KEY"),\n    },\n    "tpm": 100000,\n    "rpm": 1000,\n}]\nrouter = Router(model_list=model_list, \n                redis_host=os.environ["REDIS_HOST"], \n                redis_password=os.environ["REDIS_PASSWORD"], \n                redis_port=os.environ["REDIS_PORT"], \n                routing_strategy="usage-based-routing"\n                enable_pre_call_check=True, # enables router rate limits for concurrent calls\n                )\n\nresponse = await router.acompletion(model="gpt-3.5-turbo", \n                messages=[{"role": "user", "content": "Hey, how\'s it going?"}]\n\nprint(response)\n'))),(0,o.yg)(r.A,{value:"least-busy",label:"Least-Busy",mdxType:"TabItem"},(0,o.yg)("p",null,"Picks a deployment with the least number of ongoing calls, it's handling."),(0,o.yg)("p",null,(0,o.yg)("a",{parentName:"p",href:"https://github.com/BerriAI/litellm/blob/main/tests/local_testing/test_least_busy_routing.py"},(0,o.yg)("strong",{parentName:"a"},"How to test"))),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router \nimport asyncio\n\nmodel_list = [{ # list of model deployments \n    "model_name": "gpt-3.5-turbo", # model alias \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/chatgpt-v-2", # actual model name\n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE"),\n    }\n}, {\n    "model_name": "gpt-3.5-turbo", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "azure/chatgpt-functioncalling", \n        "api_key": os.getenv("AZURE_API_KEY"),\n        "api_version": os.getenv("AZURE_API_VERSION"),\n        "api_base": os.getenv("AZURE_API_BASE"),\n    }\n}, {\n    "model_name": "gpt-3.5-turbo", \n    "litellm_params": { # params for litellm completion/embedding call \n        "model": "gpt-3.5-turbo", \n        "api_key": os.getenv("OPENAI_API_KEY"),\n    }\n}]\n\n# init router\nrouter = Router(model_list=model_list, routing_strategy="least-busy")\nasync def router_acompletion():\n    response = await router.acompletion(\n        model="gpt-3.5-turbo", \n        messages=[{"role": "user", "content": "Hey, how\'s it going?"}]\n    )\n    print(response)\n    return response\n\nasyncio.run(router_acompletion())\n'))),(0,o.yg)(r.A,{value:"custom",label:"Custom Routing Strategy",mdxType:"TabItem"},(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Plugin a custom routing strategy to select deployments")),(0,o.yg)("p",null,"Step 1. Define your custom routing strategy"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'\nfrom litellm.router import CustomRoutingStrategyBase\nclass CustomRoutingStrategy(CustomRoutingStrategyBase):\n    async def async_get_available_deployment(\n        self,\n        model: str,\n        messages: Optional[List[Dict[str, str]]] = None,\n        input: Optional[Union[str, List]] = None,\n        specific_deployment: Optional[bool] = False,\n        request_kwargs: Optional[Dict] = None,\n    ):\n        """\n        Asynchronously retrieves the available deployment based on the given parameters.\n\n        Args:\n            model (str): The name of the model.\n            messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.\n            input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.\n            specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.\n            request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.\n\n        Returns:\n            Returns an element from litellm.router.model_list\n\n        """\n        print("In CUSTOM async get available deployment")\n        model_list = router.model_list\n        print("router model list=", model_list)\n        for model in model_list:\n            if isinstance(model, dict):\n                if model["litellm_params"]["model"] == "openai/very-special-endpoint":\n                    return model\n        pass\n\n    def get_available_deployment(\n        self,\n        model: str,\n        messages: Optional[List[Dict[str, str]]] = None,\n        input: Optional[Union[str, List]] = None,\n        specific_deployment: Optional[bool] = False,\n        request_kwargs: Optional[Dict] = None,\n    ):\n        """\n        Synchronously retrieves the available deployment based on the given parameters.\n\n        Args:\n            model (str): The name of the model.\n            messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.\n            input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.\n            specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.\n            request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.\n\n        Returns:\n            Returns an element from litellm.router.model_list\n\n        """\n        pass\n')),(0,o.yg)("p",null,"Step 2. Initialize Router with custom routing strategy"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router\n\nrouter = Router(\n    model_list=[\n        {\n            "model_name": "azure-model",\n            "litellm_params": {\n                "model": "openai/very-special-endpoint",\n                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",  # If you are Krrish, this is OpenAI Endpoint3 on our Railway endpoint :)\n                "api_key": "fake-key",\n            },\n            "model_info": {"id": "very-special-endpoint"},\n        },\n        {\n            "model_name": "azure-model",\n            "litellm_params": {\n                "model": "openai/fast-endpoint",\n                "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",\n                "api_key": "fake-key",\n            },\n            "model_info": {"id": "fast-endpoint"},\n        },\n    ],\n    set_verbose=True,\n    debug_level="DEBUG",\n    timeout=1,\n)  # type: ignore\n\nrouter.set_custom_routing_strategy(CustomRoutingStrategy()) # \ud83d\udc48 Set your routing strategy here\n')),(0,o.yg)("p",null,"Step 3. Test your routing strategy. Expect your custom routing strategy to be called when running ",(0,o.yg)("inlineCode",{parentName:"p"},"router.acompletion")," requests"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'for _ in range(10):\n    response = await router.acompletion(\n        model="azure-model", messages=[{"role": "user", "content": "hello"}]\n    )\n    print(response)\n    _picked_model_id = response._hidden_params["model_id"]\n    print("picked model=", _picked_model_id)\n'))),(0,o.yg)(r.A,{value:"lowest-cost",label:"Lowest Cost Routing (Async)",mdxType:"TabItem"},(0,o.yg)("p",null,"Picks a deployment based on the lowest cost"),(0,o.yg)("p",null,"How this works:"),(0,o.yg)("ul",null,(0,o.yg)("li",{parentName:"ul"},"Get all healthy deployments"),(0,o.yg)("li",{parentName:"ul"},"Select all deployments that are under their provided ",(0,o.yg)("inlineCode",{parentName:"li"},"rpm/tpm")," limits"),(0,o.yg)("li",{parentName:"ul"},"For each deployment check if ",(0,o.yg)("inlineCode",{parentName:"li"},'litellm_param["model"]')," exists in ",(0,o.yg)("a",{parentName:"li",href:"https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"},(0,o.yg)("inlineCode",{parentName:"a"},"litellm_model_cost_map"))," ",(0,o.yg)("ul",{parentName:"li"},(0,o.yg)("li",{parentName:"ul"},"if deployment does not exist in ",(0,o.yg)("inlineCode",{parentName:"li"},"litellm_model_cost_map")," -> use deployment_cost= ",(0,o.yg)("inlineCode",{parentName:"li"},"$1")))),(0,o.yg)("li",{parentName:"ul"},"Select deployment with lowest cost")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router \nimport asyncio\n\nmodel_list =  [\n    {\n        "model_name": "gpt-3.5-turbo",\n        "litellm_params": {"model": "gpt-4"},\n        "model_info": {"id": "openai-gpt-4"},\n    },\n    {\n        "model_name": "gpt-3.5-turbo",\n        "litellm_params": {"model": "groq/llama3-8b-8192"},\n        "model_info": {"id": "groq-llama"},\n    },\n]\n\n# init router\nrouter = Router(model_list=model_list, routing_strategy="cost-based-routing")\nasync def router_acompletion():\n    response = await router.acompletion(\n        model="gpt-3.5-turbo", \n        messages=[{"role": "user", "content": "Hey, how\'s it going?"}]\n    )\n    print(response)\n\n    print(response._hidden_params["model_id"]) # expect groq-llama, since groq/llama has lowest cost\n    return response\n\nasyncio.run(router_acompletion())\n\n')),(0,o.yg)("h4",{id:"using-custom-inputoutput-pricing"},"Using Custom Input/Output pricing"),(0,o.yg)("p",null,"Set ",(0,o.yg)("inlineCode",{parentName:"p"},'litellm_params["input_cost_per_token"]')," and ",(0,o.yg)("inlineCode",{parentName:"p"},'litellm_params["output_cost_per_token"]')," for using custom pricing when routing"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'model_list = [\n    {\n        "model_name": "gpt-3.5-turbo",\n        "litellm_params": {\n            "model": "azure/chatgpt-v-2",\n            "input_cost_per_token": 0.00003,\n            "output_cost_per_token": 0.00003,\n        },\n        "model_info": {"id": "chatgpt-v-experimental"},\n    },\n    {\n        "model_name": "gpt-3.5-turbo",\n        "litellm_params": {\n            "model": "azure/chatgpt-v-1",\n            "input_cost_per_token": 0.000000001,\n            "output_cost_per_token": 0.00000001,\n        },\n        "model_info": {"id": "chatgpt-v-1"},\n    },\n    {\n        "model_name": "gpt-3.5-turbo",\n        "litellm_params": {\n            "model": "azure/chatgpt-v-5",\n            "input_cost_per_token": 10,\n            "output_cost_per_token": 12,\n        },\n        "model_info": {"id": "chatgpt-v-5"},\n    },\n]\n# init router\nrouter = Router(model_list=model_list, routing_strategy="cost-based-routing")\nasync def router_acompletion():\n    response = await router.acompletion(\n        model="gpt-3.5-turbo", \n        messages=[{"role": "user", "content": "Hey, how\'s it going?"}]\n    )\n    print(response)\n\n    print(response._hidden_params["model_id"]) # expect chatgpt-v-1, since chatgpt-v-1 has lowest cost\n    return response\n\nasyncio.run(router_acompletion())\n')))),(0,o.yg)("h2",{id:"basic-reliability"},"Basic Reliability"),(0,o.yg)("h3",{id:"weighted-deployments"},"Weighted Deployments"),(0,o.yg)("p",null,"Set ",(0,o.yg)("inlineCode",{parentName:"p"},"weight")," on a deployment to pick one deployment more often than others. "),(0,o.yg)("p",null,"This works across ",(0,o.yg)("strong",{parentName:"p"},"simple-shuffle")," routing strategy (this is the default, if no routing strategy is selected). "),(0,o.yg)(a.A,{mdxType:"Tabs"},(0,o.yg)(r.A,{value:"sdk",label:"SDK",mdxType:"TabItem"},(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router \n\nmodel_list = [\n    {\n        "model_name": "o1",\n        "litellm_params": {\n            "model": "o1-preview", \n            "api_key": os.getenv("OPENAI_API_KEY"), \n            "weight": 1\n        },\n    },\n    {\n        "model_name": "o1",\n        "litellm_params": {\n            "model": "o1-preview", \n            "api_key": os.getenv("OPENAI_API_KEY"), \n            "weight": 2 # \ud83d\udc48 PICK THIS DEPLOYMENT 2x MORE OFTEN THAN o1-preview\n        },\n    },\n]\n\nrouter = Router(model_list=model_list, routing_strategy="cost-based-routing")\n\nresponse = await router.acompletion(\n    model="gpt-3.5-turbo", \n    messages=[{"role": "user", "content": "Hey, how\'s it going?"}]\n)\nprint(response)\n'))),(0,o.yg)(r.A,{value:"proxy",label:"PROXY",mdxType:"TabItem"},(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},"model_list:\n  - model_name: o1\n    litellm_params:\n        model: o1\n        api_key: os.environ/OPENAI_API_KEY\n        weight: 1   \n  - model_name: o1\n    litellm_params:\n        model: o1-preview\n        api_key: os.environ/OPENAI_API_KEY\n        weight: 2 # \ud83d\udc48 PICK THIS DEPLOYMENT 2x MORE OFTEN THAN o1-preview\n")))),(0,o.yg)("h3",{id:"max-parallel-requests-async"},"Max Parallel Requests (ASYNC)"),(0,o.yg)("p",null,"Used in semaphore for async requests on router. Limit the max concurrent calls made to a deployment. Useful in high-traffic scenarios. "),(0,o.yg)("p",null,"If tpm/rpm is set, and no max parallel request limit given, we use the RPM or calculated RPM (tpm/1000/6) as the max parallel request limit. "),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router \n\nmodel_list = [{\n    "model_name": "gpt-4",\n    "litellm_params": {\n        "model": "azure/gpt-4",\n        ...\n        "max_parallel_requests": 10 # \ud83d\udc48 SET PER DEPLOYMENT\n    }\n}]\n\n### OR ### \n\nrouter = Router(model_list=model_list, default_max_parallel_requests=20) # \ud83d\udc48 SET DEFAULT MAX PARALLEL REQUESTS \n\n\n# deployment max parallel requests > default max parallel requests\n')),(0,o.yg)("p",null,(0,o.yg)("a",{parentName:"p",href:"https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605"},(0,o.yg)("strong",{parentName:"a"},"See Code"))),(0,o.yg)("h3",{id:"cooldowns"},"Cooldowns"),(0,o.yg)("p",null,"Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute. "),(0,o.yg)(a.A,{mdxType:"Tabs"},(0,o.yg)(r.A,{value:"sdk",label:"SDK",mdxType:"TabItem"},(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router\n\nmodel_list = [{...}]\n\nrouter = Router(model_list=model_list, \n                allowed_fails=1,      # cooldown model if it fails > 1 call in a minute. \n                cooldown_time=100    # cooldown the deployment for 100 seconds if it num_fails > allowed_fails\n        )\n\nuser_message = "Hello, whats the weather in San Francisco??"\nmessages = [{"content": user_message, "role": "user"}]\n\n# normal call \nresponse = router.completion(model="gpt-3.5-turbo", messages=messages)\n\nprint(f"response: {response}")\n'))),(0,o.yg)(r.A,{value:"proxy",label:"PROXY",mdxType:"TabItem"},(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Set Global Value")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},"router_settings:\n    allowed_fails: 3 # cooldown model if it fails > 1 call in a minute. \n    cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails\n")),(0,o.yg)("p",null,"Defaults:"),(0,o.yg)("ul",null,(0,o.yg)("li",{parentName:"ul"},"allowed_fails: 3"),(0,o.yg)("li",{parentName:"ul"},"cooldown_time: 5s (",(0,o.yg)("inlineCode",{parentName:"li"},"DEFAULT_COOLDOWN_TIME_SECONDS")," in constants.py)")),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Set Per Model")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},"model_list:\n- model_name: fake-openai-endpoint\n  litellm_params:\n    model: predibase/llama-3-8b-instruct\n    api_key: os.environ/PREDIBASE_API_KEY\n    tenant_id: os.environ/PREDIBASE_TENANT_ID\n    max_new_tokens: 256\n    cooldown_time: 0 # \ud83d\udc48 KEY CHANGE\n")))),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Expected Response")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"No deployments available for selected model, Try again in 60 seconds. Passed model=claude-3-5-sonnet. pre-call-checks=False, allowed_model_region=n/a.\n")),(0,o.yg)("h4",{id:"disable-cooldowns"},(0,o.yg)("strong",{parentName:"h4"},"Disable cooldowns")),(0,o.yg)(a.A,{mdxType:"Tabs"},(0,o.yg)(r.A,{value:"sdk",label:"SDK",mdxType:"TabItem"},(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},"from litellm import Router \n\n\nrouter = Router(..., disable_cooldowns=True)\n"))),(0,o.yg)(r.A,{value:"proxy",label:"PROXY",mdxType:"TabItem"},(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},"router_settings:\n    disable_cooldowns: True\n")))),(0,o.yg)("h3",{id:"retries"},"Retries"),(0,o.yg)("p",null,"For both async + sync functions, we support retrying failed requests. "),(0,o.yg)("p",null,"For RateLimitError we implement exponential backoffs "),(0,o.yg)("p",null,"For generic errors, we retry immediately "),(0,o.yg)("p",null,"Here's a quick look at how we can set ",(0,o.yg)("inlineCode",{parentName:"p"},"num_retries = 3"),": "),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router\n\nmodel_list = [{...}]\n\nrouter = Router(model_list=model_list,  \n                num_retries=3)\n\nuser_message = "Hello, whats the weather in San Francisco??"\nmessages = [{"content": user_message, "role": "user"}]\n\n# normal call \nresponse = router.completion(model="gpt-3.5-turbo", messages=messages)\n\nprint(f"response: {response}")\n')),(0,o.yg)("p",null,"We also support setting minimum time to wait before retrying a failed request. This is via the ",(0,o.yg)("inlineCode",{parentName:"p"},"retry_after")," param. "),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router\n\nmodel_list = [{...}]\n\nrouter = Router(model_list=model_list,  \n                num_retries=3, retry_after=5) # waits min 5s before retrying request\n\nuser_message = "Hello, whats the weather in San Francisco??"\nmessages = [{"content": user_message, "role": "user"}]\n\n# normal call \nresponse = router.completion(model="gpt-3.5-turbo", messages=messages)\n\nprint(f"response: {response}")\n')),(0,o.yg)("h3",{id:"advanced-custom-retries-cooldowns-based-on-error-type"},"[Advanced]",": Custom Retries, Cooldowns based on Error Type"),(0,o.yg)("ul",null,(0,o.yg)("li",{parentName:"ul"},"Use ",(0,o.yg)("inlineCode",{parentName:"li"},"RetryPolicy")," if you want to set a ",(0,o.yg)("inlineCode",{parentName:"li"},"num_retries")," based on the Exception received"),(0,o.yg)("li",{parentName:"ul"},"Use ",(0,o.yg)("inlineCode",{parentName:"li"},"AllowedFailsPolicy")," to set a custom number of ",(0,o.yg)("inlineCode",{parentName:"li"},"allowed_fails"),"/minute before cooling down a deployment")),(0,o.yg)("p",null,(0,o.yg)("a",{parentName:"p",href:"https://github.com/BerriAI/litellm/blob/ccda616f2f881375d4e8586c76fe4662909a7d22/litellm/types/router.py#L436"},(0,o.yg)("strong",{parentName:"a"},"See All Exception Types"))),(0,o.yg)(a.A,{mdxType:"Tabs"},(0,o.yg)(r.A,{value:"sdk",label:"SDK",mdxType:"TabItem"},(0,o.yg)("p",null,"Example:"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},"retry_policy = RetryPolicy(\n    ContentPolicyViolationErrorRetries=3,         # run 3 retries for ContentPolicyViolationErrors\n    AuthenticationErrorRetries=0,                 # run 0 retries for AuthenticationErrorRetries\n)\n\nallowed_fails_policy = AllowedFailsPolicy(\n    ContentPolicyViolationErrorAllowedFails=1000, # Allow 1000 ContentPolicyViolationError before cooling down a deployment\n    RateLimitErrorAllowedFails=100,               # Allow 100 RateLimitErrors before cooling down a deployment\n)\n")),(0,o.yg)("p",null,"Example Usage"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm.router import RetryPolicy, AllowedFailsPolicy\n\nretry_policy = RetryPolicy(\n    ContentPolicyViolationErrorRetries=3,         # run 3 retries for ContentPolicyViolationErrors\n    AuthenticationErrorRetries=0,                 # run 0 retries for AuthenticationErrorRetries\n    BadRequestErrorRetries=1,\n    TimeoutErrorRetries=2,\n    RateLimitErrorRetries=3,\n)\n\nallowed_fails_policy = AllowedFailsPolicy(\n    ContentPolicyViolationErrorAllowedFails=1000, # Allow 1000 ContentPolicyViolationError before cooling down a deployment\n    RateLimitErrorAllowedFails=100,               # Allow 100 RateLimitErrors before cooling down a deployment\n)\n\nrouter = litellm.Router(\n    model_list=[\n        {\n            "model_name": "gpt-3.5-turbo",  # openai model name\n            "litellm_params": {  # params for litellm completion/embedding call\n                "model": "azure/chatgpt-v-2",\n                "api_key": os.getenv("AZURE_API_KEY"),\n                "api_version": os.getenv("AZURE_API_VERSION"),\n                "api_base": os.getenv("AZURE_API_BASE"),\n            },\n        },\n        {\n            "model_name": "bad-model",  # openai model name\n            "litellm_params": {  # params for litellm completion/embedding call\n                "model": "azure/chatgpt-v-2",\n                "api_key": "bad-key",\n                "api_version": os.getenv("AZURE_API_VERSION"),\n                "api_base": os.getenv("AZURE_API_BASE"),\n            },\n        },\n    ],\n    retry_policy=retry_policy,\n    allowed_fails_policy=allowed_fails_policy,\n)\n\nresponse = await router.acompletion(\n    model=model,\n    messages=messages,\n)\n'))),(0,o.yg)(r.A,{value:"proxy",label:"PROXY",mdxType:"TabItem"},(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-yaml"},'router_settings: \n  retry_policy: {\n    "BadRequestErrorRetries": 3,\n    "ContentPolicyViolationErrorRetries": 4\n  }\n  allowed_fails_policy: {\n    "ContentPolicyViolationErrorAllowedFails": 1000, # Allow 1000 ContentPolicyViolationError before cooling down a deployment\n    "RateLimitErrorAllowedFails": 100 # Allow 100 RateLimitErrors before cooling down a deployment\n  }\n')))),(0,o.yg)("h3",{id:"caching"},"Caching"),(0,o.yg)("p",null,"In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching. "),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"In-memory Cache")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},"router = Router(model_list=model_list, \n                cache_responses=True)\n\nprint(response)\n")),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Redis Cache")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'router = Router(model_list=model_list, \n                redis_host=os.getenv("REDIS_HOST"), \n                redis_password=os.getenv("REDIS_PASSWORD"), \n                redis_port=os.getenv("REDIS_PORT"),\n                cache_responses=True)\n\nprint(response)\n')),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Pass in Redis URL, additional kwargs")," "),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'router = Router(model_list: Optional[list] = None,\n                 ## CACHING ## \n                 redis_url=os.getenv("REDIS_URL")",\n                 cache_kwargs= {}, # additional kwargs to pass to RedisCache (see caching.py)\n                 cache_responses=True)\n')),(0,o.yg)("h2",{id:"pre-call-checks-context-window-eu-regions"},"Pre-Call Checks (Context Window, EU-Regions)"),(0,o.yg)("p",null,"Enable pre-call checks to filter out:"),(0,o.yg)("ol",null,(0,o.yg)("li",{parentName:"ol"},"deployments with context window limit < messages for a call."),(0,o.yg)("li",{parentName:"ol"},"deployments outside of eu-region")),(0,o.yg)(a.A,{mdxType:"Tabs"},(0,o.yg)(r.A,{value:"sdk",label:"SDK",mdxType:"TabItem"},(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"1. Enable pre-call checks")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},"from litellm import Router \n# ...\nrouter = Router(model_list=model_list, enable_pre_call_checks=True) # \ud83d\udc48 Set to True\n")),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"2. Set Model List")),(0,o.yg)("p",null,"For context window checks on azure deployments, set the base model. Pick the base model from ",(0,o.yg)("a",{parentName:"p",href:"https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"},"this list"),", all the azure models start with ",(0,o.yg)("inlineCode",{parentName:"p"},"azure/"),". "),(0,o.yg)("p",null,"For 'eu-region' filtering, Set 'region_name' of deployment. "),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Note:")," We automatically infer region_name for Vertex AI, Bedrock, and IBM WatsonxAI based on your litellm params. For Azure, set ",(0,o.yg)("inlineCode",{parentName:"p"},"litellm.enable_preview = True"),"."),(0,o.yg)("p",null,(0,o.yg)("a",{parentName:"p",href:"https://github.com/BerriAI/litellm/blob/d33e49411d6503cb634f9652873160cd534dec96/litellm/router.py#L2958"},(0,o.yg)("strong",{parentName:"a"},"See Code"))),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'model_list = [\n            {\n                "model_name": "gpt-3.5-turbo", # model group name\n                "litellm_params": {  # params for litellm completion/embedding call\n                    "model": "azure/chatgpt-v-2",\n                    "api_key": os.getenv("AZURE_API_KEY"),\n                    "api_version": os.getenv("AZURE_API_VERSION"),\n                    "api_base": os.getenv("AZURE_API_BASE"),\n                    "region_name": "eu" # \ud83d\udc48 SET \'EU\' REGION NAME\n                    "base_model": "azure/gpt-35-turbo", # \ud83d\udc48 (Azure-only) SET BASE MODEL\n                },\n            },\n            {\n                "model_name": "gpt-3.5-turbo", # model group name\n                "litellm_params": {  # params for litellm completion/embedding call\n                    "model": "gpt-3.5-turbo-1106",\n                    "api_key": os.getenv("OPENAI_API_KEY"),\n                },\n            },\n            {\n                "model_name": "gemini-pro",\n                "litellm_params: {\n                    "model": "vertex_ai/gemini-pro-1.5", \n                    "vertex_project": "adroit-crow-1234",\n                    "vertex_location": "us-east1" # \ud83d\udc48 AUTOMATICALLY INFERS \'region_name\'\n                }\n            }\n        ]\n\nrouter = Router(model_list=model_list, enable_pre_call_checks=True) \n')),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"3. Test it!")),(0,o.yg)(a.A,{mdxType:"Tabs"},(0,o.yg)(r.A,{value:"context-window-check",label:"Context Window Check",mdxType:"TabItem"},(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'"""\n- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)\n- Send a 5k prompt\n- Assert it works\n"""\nfrom litellm import Router\nimport os\n\nmodel_list = [\n    {\n        "model_name": "gpt-3.5-turbo",  # model group name\n        "litellm_params": {  # params for litellm completion/embedding call\n            "model": "azure/chatgpt-v-2",\n            "api_key": os.getenv("AZURE_API_KEY"),\n            "api_version": os.getenv("AZURE_API_VERSION"),\n            "api_base": os.getenv("AZURE_API_BASE"),\n            "base_model": "azure/gpt-35-turbo",\n        },\n        "model_info": {\n            "base_model": "azure/gpt-35-turbo", \n        }\n    },\n    {\n        "model_name": "gpt-3.5-turbo",  # model group name\n        "litellm_params": {  # params for litellm completion/embedding call\n            "model": "gpt-3.5-turbo-1106",\n            "api_key": os.getenv("OPENAI_API_KEY"),\n        },\n    },\n]\n\nrouter = Router(model_list=model_list, enable_pre_call_checks=True) \n\ntext = "What is the meaning of 42?" * 5000\n\nresponse = router.completion(\n    model="gpt-3.5-turbo",\n    messages=[\n        {"role": "system", "content": text},\n        {"role": "user", "content": "Who was Alexander?"},\n    ],\n)\n\nprint(f"response: {response}")\n'))),(0,o.yg)(r.A,{value:"eu-region-check",label:"EU Region Check",mdxType:"TabItem"},(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'"""\n- Give 2 gpt-3.5-turbo deployments, in eu + non-eu regions\n- Make a call\n- Assert it picks the eu-region model\n"""\n\nfrom litellm import Router\nimport os\n\nmodel_list = [\n    {\n        "model_name": "gpt-3.5-turbo",  # model group name\n        "litellm_params": {  # params for litellm completion/embedding call\n            "model": "azure/chatgpt-v-2",\n            "api_key": os.getenv("AZURE_API_KEY"),\n            "api_version": os.getenv("AZURE_API_VERSION"),\n            "api_base": os.getenv("AZURE_API_BASE"),\n            "region_name": "eu"\n        },\n        "model_info": {\n            "id": "1"\n        }\n    },\n    {\n        "model_name": "gpt-3.5-turbo",  # model group name\n        "litellm_params": {  # params for litellm completion/embedding call\n            "model": "gpt-3.5-turbo-1106",\n            "api_key": os.getenv("OPENAI_API_KEY"),\n        },\n        "model_info": {\n            "id": "2"\n        }\n    },\n]\n\nrouter = Router(model_list=model_list, enable_pre_call_checks=True) \n\nresponse = router.completion(\n    model="gpt-3.5-turbo",\n    messages=[{"role": "user", "content": "Who was Alexander?"}],\n)\n\nprint(f"response: {response}")\n\nprint(f"response id: {response._hidden_params[\'model_id\']}")\n'))))),(0,o.yg)(r.A,{value:"proxy",label:"Proxy",mdxType:"TabItem"},(0,o.yg)("admonition",{type:"info"},(0,o.yg)("p",{parentName:"admonition"},"Go ",(0,o.yg)("a",{parentName:"p",href:"/docs/proxy/reliability#advanced---context-window-fallbacks"},"here")," for how to do this on the proxy")))),(0,o.yg)("h2",{id:"caching-across-model-groups"},"Caching across model groups"),(0,o.yg)("p",null,"If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups. "),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'import litellm, asyncio, time\nfrom litellm import Router \n\n# set os env\nos.environ["OPENAI_API_KEY"] = ""\nos.environ["AZURE_API_KEY"] = ""\nos.environ["AZURE_API_BASE"] = ""\nos.environ["AZURE_API_VERSION"] = ""\n\nasync def test_acompletion_caching_on_router_caching_groups(): \n    # tests acompletion + caching on router \n    try:\n        litellm.set_verbose = True\n        model_list = [\n            {\n                "model_name": "openai-gpt-3.5-turbo",\n                "litellm_params": {\n                    "model": "gpt-3.5-turbo-0613",\n                    "api_key": os.getenv("OPENAI_API_KEY"),\n                },\n            },\n            {\n                "model_name": "azure-gpt-3.5-turbo",\n                "litellm_params": {\n                    "model": "azure/chatgpt-v-2",\n                    "api_key": os.getenv("AZURE_API_KEY"),\n                    "api_base": os.getenv("AZURE_API_BASE"),\n                    "api_version": os.getenv("AZURE_API_VERSION")\n                },\n            }\n        ]\n\n        messages = [\n            {"role": "user", "content": f"write a one sentence poem {time.time()}?"}\n        ]\n        start_time = time.time()\n        router = Router(model_list=model_list, \n                cache_responses=True, \n                caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")])\n        response1 = await router.acompletion(model="openai-gpt-3.5-turbo", messages=messages, temperature=1)\n        print(f"response1: {response1}")\n        await asyncio.sleep(1) # add cache is async, async sleep for cache to get set\n        response2 = await router.acompletion(model="azure-gpt-3.5-turbo", messages=messages, temperature=1)\n        assert response1.id == response2.id\n        assert len(response1.choices[0].message.content) > 0\n        assert response1.choices[0].message.content == response2.choices[0].message.content\n    except Exception as e:\n        traceback.print_exc()\n\nasyncio.run(test_acompletion_caching_on_router_caching_groups())\n')),(0,o.yg)("h2",{id:"alerting-"},"Alerting \ud83d\udea8"),(0,o.yg)("p",null,"Send alerts to slack / your webhook url for the following events"),(0,o.yg)("ul",null,(0,o.yg)("li",{parentName:"ul"},"LLM API Exceptions"),(0,o.yg)("li",{parentName:"ul"},"Slow LLM Responses")),(0,o.yg)("p",null,"Get a slack webhook url from ",(0,o.yg)("a",{parentName:"p",href:"https://api.slack.com/messaging/webhooks"},"https://api.slack.com/messaging/webhooks")),(0,o.yg)("h4",{id:"usage"},"Usage"),(0,o.yg)("p",null,"Initialize an ",(0,o.yg)("inlineCode",{parentName:"p"},"AlertingConfig")," and pass it to ",(0,o.yg)("inlineCode",{parentName:"p"},"litellm.Router"),". The following code will trigger an alert because ",(0,o.yg)("inlineCode",{parentName:"p"},"api_key=bad-key")," which is invalid"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm.router import AlertingConfig\nimport litellm\nimport os\n\nrouter = litellm.Router(\n    model_list=[\n        {\n            "model_name": "gpt-3.5-turbo",\n            "litellm_params": {\n                "model": "gpt-3.5-turbo",\n                "api_key": "bad_key",\n            },\n        }\n    ],\n    alerting_config= AlertingConfig(\n        alerting_threshold=10,                        # threshold for slow / hanging llm responses (in seconds). Defaults to 300 seconds\n        webhook_url= os.getenv("SLACK_WEBHOOK_URL")   # webhook you want to send alerts to\n    ),\n)\ntry:\n    await router.acompletion(\n        model="gpt-3.5-turbo",\n        messages=[{"role": "user", "content": "Hey, how\'s it going?"}],\n    )\nexcept:\n    pass\n')),(0,o.yg)("h2",{id:"track-cost-for-azure-deployments"},"Track cost for Azure Deployments"),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Problem"),": Azure returns ",(0,o.yg)("inlineCode",{parentName:"p"},"gpt-4")," in the response when ",(0,o.yg)("inlineCode",{parentName:"p"},"azure/gpt-4-1106-preview")," is used. This leads to inaccurate cost tracking"),(0,o.yg)("p",null,(0,o.yg)("strong",{parentName:"p"},"Solution")," \u2705 :  Set ",(0,o.yg)("inlineCode",{parentName:"p"},'model_info["base_model"]')," on your router init so litellm uses the correct model for calculating azure cost"),(0,o.yg)("p",null,"Step 1. Router Setup"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router\n\nmodel_list = [\n    { # list of model deployments \n        "model_name": "gpt-4-preview", # model alias \n        "litellm_params": { # params for litellm completion/embedding call \n            "model": "azure/chatgpt-v-2", # actual model name\n            "api_key": os.getenv("AZURE_API_KEY"),\n            "api_version": os.getenv("AZURE_API_VERSION"),\n            "api_base": os.getenv("AZURE_API_BASE")\n        },\n        "model_info": {\n            "base_model": "azure/gpt-4-1106-preview" # azure/gpt-4-1106-preview will be used for cost tracking, ensure this exists in litellm model_prices_and_context_window.json\n        }\n    }, \n    {\n        "model_name": "gpt-4-32k", \n        "litellm_params": { # params for litellm completion/embedding call \n            "model": "azure/chatgpt-functioncalling", \n            "api_key": os.getenv("AZURE_API_KEY"),\n            "api_version": os.getenv("AZURE_API_VERSION"),\n            "api_base": os.getenv("AZURE_API_BASE")\n        },\n        "model_info": {\n            "base_model": "azure/gpt-4-32k" # azure/gpt-4-32k will be used for cost tracking, ensure this exists in litellm model_prices_and_context_window.json\n        }\n    }\n]\n\nrouter = Router(model_list=model_list)\n\n')),(0,o.yg)("p",null,"Step 2. Access ",(0,o.yg)("inlineCode",{parentName:"p"},"response_cost")," in the custom callback, ",(0,o.yg)("strong",{parentName:"p"},"litellm calculates the response cost for you")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'import litellm\nfrom litellm.integrations.custom_logger import CustomLogger\n\nclass MyCustomHandler(CustomLogger):        \n    def log_success_event(self, kwargs, response_obj, start_time, end_time): \n        print(f"On Success")\n        response_cost = kwargs.get("response_cost")\n        print("response_cost=", response_cost)\n\ncustomHandler = MyCustomHandler()\nlitellm.callbacks = [customHandler]\n\n# router completion call\nresponse = router.completion(\n    model="gpt-4-32k", \n    messages=[{ "role": "user", "content": "Hi who are you"}]\n)\n')),(0,o.yg)("h4",{id:"default-litellmcompletionembedding-params"},"Default litellm.completion/embedding params"),(0,o.yg)("p",null,"You can also set default params for litellm completion/embedding calls. Here's how to do that: "),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router\n\nfallback_dict = {"gpt-3.5-turbo": "gpt-3.5-turbo-16k"}\n\nrouter = Router(model_list=model_list, \n                default_litellm_params={"context_window_fallback_dict": fallback_dict})\n\nuser_message = "Hello, whats the weather in San Francisco??"\nmessages = [{"content": user_message, "role": "user"}]\n\n# normal call \nresponse = router.completion(model="gpt-3.5-turbo", messages=messages)\n\nprint(f"response: {response}")\n')),(0,o.yg)("h2",{id:"custom-callbacks---track-api-key-api-endpoint-model-used"},"Custom Callbacks - Track API Key, API Endpoint, Model Used"),(0,o.yg)("p",null,"If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a ",(0,o.yg)("a",{parentName:"p",href:"https://docs.litellm.ai/docs/observability/custom_callback"},"custom callback")," "),(0,o.yg)("h3",{id:"usage-1"},"Usage"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'import litellm\nfrom litellm.integrations.custom_logger import CustomLogger\n\nclass MyCustomHandler(CustomLogger):        \n    def log_success_event(self, kwargs, response_obj, start_time, end_time): \n        print(f"On Success")\n        print("kwargs=", kwargs)\n        litellm_params= kwargs.get("litellm_params")\n        api_key = litellm_params.get("api_key")\n        api_base = litellm_params.get("api_base")\n        custom_llm_provider= litellm_params.get("custom_llm_provider")\n        response_cost = kwargs.get("response_cost")\n\n        # print the values\n        print("api_key=", api_key)\n        print("api_base=", api_base)\n        print("custom_llm_provider=", custom_llm_provider)\n        print("response_cost=", response_cost)\n\n    def log_failure_event(self, kwargs, response_obj, start_time, end_time): \n        print(f"On Failure")\n        print("kwargs=")\n\ncustomHandler = MyCustomHandler()\n\nlitellm.callbacks = [customHandler]\n\n# Init Router\nrouter = Router(model_list=model_list, routing_strategy="simple-shuffle")\n\n# router completion call\nresponse = router.completion(\n    model="gpt-3.5-turbo", \n    messages=[{ "role": "user", "content": "Hi who are you"}]\n)\n')),(0,o.yg)("h2",{id:"deploy-router"},"Deploy Router"),(0,o.yg)("p",null,"If you want a server to load balance across different LLM APIs, use our ",(0,o.yg)("a",{parentName:"p",href:"./simple_proxy#load-balancing---multiple-instances-of-1-model"},"LiteLLM Proxy Server")),(0,o.yg)("h2",{id:"debugging-router"},"Debugging Router"),(0,o.yg)("h3",{id:"basic-debugging"},"Basic Debugging"),(0,o.yg)("p",null,"Set ",(0,o.yg)("inlineCode",{parentName:"p"},"Router(set_verbose=True)")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},"from litellm import Router\n\nrouter = Router(\n    model_list=model_list,\n    set_verbose=True\n)\n")),(0,o.yg)("h3",{id:"detailed-debugging"},"Detailed Debugging"),(0,o.yg)("p",null,"Set ",(0,o.yg)("inlineCode",{parentName:"p"},'Router(set_verbose=True,debug_level="DEBUG")')),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router\n\nrouter = Router(\n    model_list=model_list,\n    set_verbose=True,\n    debug_level="DEBUG"  # defaults to INFO\n)\n')),(0,o.yg)("h3",{id:"very-detailed-debugging"},"Very Detailed Debugging"),(0,o.yg)("p",null,"Set ",(0,o.yg)("inlineCode",{parentName:"p"},"litellm.set_verbose=True")," and ",(0,o.yg)("inlineCode",{parentName:"p"},'Router(set_verbose=True,debug_level="DEBUG")')),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},'from litellm import Router\nimport litellm\n\nlitellm.set_verbose = True\n\nrouter = Router(\n    model_list=model_list,\n    set_verbose=True,\n    debug_level="DEBUG"  # defaults to INFO\n)\n')),(0,o.yg)("h2",{id:"router-general-settings"},"Router General Settings"),(0,o.yg)("h3",{id:"usage-2"},"Usage"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},"router = Router(model_list=..., router_general_settings=RouterGeneralSettings(async_only_mode=True))\n")),(0,o.yg)("h3",{id:"spec"},"Spec"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre",className:"language-python"},"class RouterGeneralSettings(BaseModel):\n    async_only_mode: bool = Field(\n        default=False\n    )  # this will only initialize async clients. Good for memory utils\n    pass_through_all_models: bool = Field(\n        default=False\n    )  # if passed a model not llm_router model list, pass through the request to litellm.acompletion/embedding\n")))}c.isMDXComponent=!0}}]);