
{"id":8025,"date":"2026-05-06T02:41:28","date_gmt":"2026-05-05T18:41:28","guid":{"rendered":"https:\/\/infernews.com\/blog\/?page_id=8025"},"modified":"2026-05-06T02:42:55","modified_gmt":"2026-05-05T18:42:55","slug":"q-learning","status":"publish","type":"page","link":"https:\/\/infernews.com\/blog\/q-learning\/","title":{"rendered":"Q\u2011Learning"},"content":{"rendered":"\n<p class=\"wp-block-paragraph\">Q\u2011Learning \u662f\u4e00\u7a2e\u300c\u6a21\u578b\u7121\u95dc\u300d\uff08model\u2011free\uff09\u7684\u5f37\u5316\u5b78\u7fd2\u6f14\u7b97\u6cd5\uff0c\u7528\u4f86\u5728\u4e0d\u77e5\u9053\u74b0\u5883\u8f49\u79fb\u6a5f\u7387\u7684\u60c5\u6cc1\u4e0b\uff0c\u900f\u904e\u53cd\u8986\u8a66\u8aa4\u5b78\u51fa\u5728\u6bcf\u500b\u72c0\u614b\u4e0b\u61c9\u8a72\u63a1\u53d6\u54ea\u500b\u52d5\u4f5c\u53ef\u4ee5\u8b93\u9577\u671f\u7d2f\u7a4d\u56de\u5831\u6700\u5927\u5316\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u6838\u5fc3\u6982\u5ff5<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u5728\u5f37\u5316\u5b78\u7fd2\u88e1\uff0c\u6211\u5011\u6709\u300c\u72c0\u614b\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>s<\/mi><\/mrow><\/semantics><\/math>\u300d\u3001\u300c\u52d5\u4f5c\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>a<\/mi><\/mrow><\/semantics><\/math>\u300d\u3001\u300c\u56de\u994b\uff0f\u734e\u52f5\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>r<\/mi><\/mrow><\/semantics><\/math>\u300d\u3001\u300c\u6298\u6263\u56e0\u5b50\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>\u03b3<\/mi><\/mrow><\/semantics><\/math>\u300d\uff0c\u4ee5\u53ca agent \u8207\u74b0\u5883\u53cd\u8986\u4e92\u52d5\u7684\u904e\u7a0b\u3002<\/li>\n\n\n\n<li>Q\u2011Learning \u4e0d\u9700\u8981\u74b0\u5883\u6a21\u578b\uff08\u4f8b\u5982\u00a0<math 
xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>P<\/mi><mo stretchy=\"false\">(<\/mo><msup><mi>s<\/mi><mo mathvariant=\"normal\" lspace=\"0em\" rspace=\"0em\">\u2032<\/mo><\/msup><mo>\u2223<\/mo><mi>s<\/mi><mo separator=\"true\">,<\/mo><mi>a<\/mi><mo stretchy=\"false\">)<\/mo><\/mrow><\/semantics><\/math>\uff09\uff0c\u800c\u662f\u76f4\u63a5\u5b78\u4e00\u500b action\u2011value \u51fd\u6578\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>Q<\/mi><mo stretchy=\"false\">(<\/mo><mi>s<\/mi><mo separator=\"true\">,<\/mo><mi>a<\/mi><mo stretchy=\"false\">)<\/mo><\/mrow><\/semantics><\/math>\uff1a\u4ee3\u8868\u5728\u72c0\u614b\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>s<\/mi><\/mrow><\/semantics><\/math>\u00a0\u4e0b\u63a1\u53d6\u52d5\u4f5c\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>a<\/mi><\/mrow><\/semantics><\/math>\uff0c\u4e4b\u5f8c\u6309\u7167\u6700\u512a\u7b56\u7565\u884c\u52d5\u6642\u7684\u671f\u671b\u7d2f\u7a4d\u56de\u5831\u3002<\/li>\n\n\n\n<li>\u901a\u5e38\u7528\u4e00\u500b Q\u2011table\uff08\u6216\u5728\u9023\u7e8c\u7a7a\u9593\u6642\u7528 NN \u8fd1\u4f3c\uff09\u4f86\u5b58\u9019\u4e9b\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>Q<\/mi><mo stretchy=\"false\">(<\/mo><mi>s<\/mi><mo separator=\"true\">,<\/mo><mi>a<\/mi><mo stretchy=\"false\">)<\/mo><\/mrow><\/semantics><\/math>\u00a0\u503c\uff0cagent \u5728\u6c7a\u7b56\u6642\u5c31\u67e5\u8868\u9078\u64c7 Q \u503c\u6700\u5927\u7684\u52d5\u4f5c\uff08\u6216\u505a \u03b5\u2011greedy \u63a2\u7d22\uff09\u3002<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"bellman\">\u66f4\u65b0\u898f\u5247\uff08Bellman \u65b9\u7a0b\uff09<\/h2>\n\n\n\n<p 
class=\"wp-block-paragraph\">\u6bcf\u6b21\u4e92\u52d5\u4e00\u6b65&nbsp;<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>s<\/mi><mo separator=\"true\">,<\/mo><mi>a<\/mi><mo separator=\"true\">,<\/mo><mi>r<\/mi><mo separator=\"true\">,<\/mo><msup><mi>s<\/mi><mo mathvariant=\"normal\" lspace=\"0em\" rspace=\"0em\">\u2032<\/mo><\/msup><\/mrow><\/semantics><\/math>&nbsp;\u5f8c\uff0cQ\u2011Learning \u7528 Bellman optimality \u7684 TD \u66f4\u65b0\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u76f4\u89c0\u7248\u53e3\u982d\u63cf\u8ff0\uff1a\n<ul class=\"wp-block-list\">\n<li>\u53d6\u76ee\u524d\u7684\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>Q<\/mi><mo stretchy=\"false\">(<\/mo><mi>s<\/mi><mo separator=\"true\">,<\/mo><mi>a<\/mi><mo stretchy=\"false\">)<\/mo><\/mrow><\/semantics><\/math>\u3002<\/li>\n\n\n\n<li>\u7528\u300c\u7acb\u5373\u734e\u52f5\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>r<\/mi><\/mrow><\/semantics><\/math>\u00a0+ \u6298\u6263\u5f8c\u7684\u4e0b\u4e00\u72c0\u614b\u6700\u5927 Q \u503c\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>\u03b3<\/mi><msub><mrow><mi>max<\/mi><mo>\u2061<\/mo><\/mrow><msup><mi>a<\/mi><mo mathvariant=\"normal\" lspace=\"0em\" rspace=\"0em\">\u2032<\/mo><\/msup><\/msub><mi>Q<\/mi><mo stretchy=\"false\">(<\/mo><msup><mi>s<\/mi><mo mathvariant=\"normal\" lspace=\"0em\" rspace=\"0em\">\u2032<\/mo><\/msup><mo separator=\"true\">,<\/mo><msup><mi>a<\/mi><mo mathvariant=\"normal\" lspace=\"0em\" rspace=\"0em\">\u2032<\/mo><\/msup><mo stretchy=\"false\">)<\/mo><\/mrow><\/semantics><\/math>\u300d\u7576\u6210\u65b0\u7684\u76ee\u6a19\u3002<\/li>\n\n\n\n<li>\u5169\u8005\u5dee\u53eb\u505a TD 
error\uff0c\u4e58\u4e0a\u5b78\u7fd2\u7387\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>\u03b1<\/mi><\/mrow><\/semantics><\/math>\u00a0\u52a0\u56de\u53bb\uff0c\u66f4\u65b0\u00a0<math xmlns=\"http:\/\/www.w3.org\/1998\/Math\/MathML\"><semantics><mrow><mi>Q<\/mi><mo stretchy=\"false\">(<\/mo><mi>s<\/mi><mo separator=\"true\">,<\/mo><mi>a<\/mi><mo stretchy=\"false\">)<\/mo><\/mrow><\/semantics><\/math>\u3002<\/li>\n<\/ul>\n<\/li>\n\n\n\n<li>\u53cd\u8986\u5957\u7528\u9019\u500b\u66f4\u65b0\uff0c\u5728\u5408\u9069\u7684\u63a2\u7d22\u7b56\u7565\u8207\u5b78\u7fd2\u7387\u689d\u4ef6\u4e0b\uff0cQ\u2011Learning \u53ef\u4ee5\u6536\u6582\u5230\u6700\u512a Q \u51fd\u6578\uff0c\u56e0\u6b64\u4e5f\u9593\u63a5\u5b78\u5230\u6700\u512a\u7b56\u7565\uff08\u5c0d\u6bcf\u500b\u72c0\u614b\u9078 Q \u6700\u5927\u7684\u52d5\u4f5c\uff09\u3002<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\">\u7279\u9ede\u8207\u61c9\u7528\u76f4\u89ba<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Off\u2011policy\uff1a\u66f4\u65b0\u6642\u662f\u770b\u300c\u4e0b\u4e00\u6b65\u53ef\u80fd\u7684\u6700\u4f73\u52d5\u4f5c\u300d\u7684\u56de\u5831\uff0c\u4e0d\u4e00\u5b9a\u662f\u5be6\u969b\u8d70\u7684\u52d5\u4f5c\uff0c\u56e0\u6b64\u5373\u4f7f\u7528\u96a8\u6a5f\u6216 exploratory policy \u63a2\u7d22\uff0c\u4e5f\u80fd\u5b78\u5230\u6700\u512a\u7b56\u7565\u3002<\/li>\n\n\n\n<li>\u512a\u9ede\uff1a\u5be6\u4f5c\u7c21\u55ae\u3001\u4e0d\u9700\u8981\u74b0\u5883\u6a21\u578b\uff0c\u9069\u7528\u65bc\u96e2\u6563\u72c0\u614b\uff0f\u52d5\u4f5c\u7a7a\u9593\u7684\u5c0f\u5230\u4e2d\u578b\u554f\u984c\uff0c\u4f8b\u5982\u7db2\u683c\u4e16\u754c\u3001\u7c21\u55ae\u904a\u6232\u3001routing \u63a7\u5236\u7b49\u3002<\/li>\n\n\n\n<li>\u5ef6\u4f38\uff1a\u7576\u72c0\u614b\u7a7a\u9593\u592a\u5927\u6642\uff0c\u5c31\u6703\u767c\u5c55\u6210 Deep Q\u2011Learning\uff08DQN\uff09\uff0c\u7528\u6df1\u5ea6\u7db2\u8def\u8fd1\u4f3c Q 
\u51fd\u6578\uff0c\u800c\u4e0d\u662f\u660e\u78ba\u5b58\u6574\u5f35 Q\u2011table\u3002<\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">\u5982\u679c\u4f60\u60f3\uff0c\u6211\u53ef\u4ee5\u4e0b\u4e00\u6b65\u76f4\u63a5\u7d66\u4f60\u4e00\u500b\u8d85\u77ed\u7684 Python \u5be6\u4f5c\u7bc4\u4f8b\uff08tabular Q\u2011learning\uff09\u7576\u4f5c template\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u7531\u6df1\u5ea6\u7814\u7a76\u6e96\u5099<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u4e00\u500b\u8d85\u77ed\u7684 Python \u5be6\u4f5c\u7bc4\u4f8b\uff08tabular Q\u2011learning\uff09\u7576\u4f5c template\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u4e0b\u9762\u662f\u4e00\u500b\u300c\u6700\u5c0f\u53ef\u7528\u300d\u7684 tabular Q\u2011learning Template\uff0c\u7528 Gymnasium \u7684 FrozenLake\uff08\u96e2\u6563\u72c0\u614b\uff0f\u52d5\u4f5c\uff09\u793a\u7bc4\uff0c\u4f60\u53ef\u4ee5\u76f4\u63a5\u6539\u6210\u4efb\u4f55\u96e2\u6563\u74b0\u5883\u3002<\/p>\n\n\n\n<div class=\"codecopy-container\">\n                <div class=\"codecopy-header\">\n                    \n                    <button class=\"codecopy-btn\" data-target=\"codecopy-6a347ba03f795\" title=\"Copy code to clipboard\">\n                <svg width=\"16\" height=\"16\" viewBox=\"0 0 24 24\" fill=\"none\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\">\n                    <path d=\"M16 1H4C2.9 1 2 1.9 2 3V17H4V3H16V1ZM19 5H8C6.9 5 6 5.9 6 7V21C6 22.1 6.9 23 8 23H19C20.1 23 21 22.1 21 21V7C21 5.9 20.1 5 19 5ZM19 21H8V7H19V21Z\" fill=\"currentColor\"\/>\n                <\/svg>\n                <span class=\"codecopy-text\">Copy<\/span>\n            <\/button>\n                <\/div>\n                <pre class=\"line-numbers\" id=\"codecopy-6a347ba03f795\"><code class=\"language-text\">import numpy as np\nimport gymnasium as gym\n\nenv = gym.make(\"FrozenLake-v1\", is_slippery=False)  # \u96e2\u6563 state\/action \u74b0\u5883\nn_states = env.observation_space.n\nn_actions = env.action_space.n\n\nQ = 
np.zeros((n_states, n_actions))  # Q-table\n\nalpha = 0.1      # learning rate\ngamma = 0.99     # discount factor\nepsilon = 0.1    # \u03b5-greedy\nn_episodes = 2000\nmax_steps = 100\n\ndef epsilon_greedy_action(state):\n    if np.random.rand() &lt; epsilon:\n        return env.action_space.sample()          # explore\n    return np.argmax(Q&#91;state])                    # exploit\n\nfor ep in range(n_episodes):\n    state, _ = env.reset()\n    for t in range(max_steps):\n        action = epsilon_greedy_action(state)\n        next_state, reward, terminated, truncated, _ = env.step(action)\n        done = terminated or truncated\n\n        # Q-learning \u66f4\u65b0\n        best_next = np.max(Q&#91;next_state])\n        td_target = reward + gamma * best_next\n        td_error = td_target - Q&#91;state, action]\n        Q&#91;state, action] += alpha * td_error\n\n        state = next_state\n        if done:\n            break\n\n# \u6e2c\u8a66\uff1a\u7528\u5b78\u5230\u7684 Q-table \u73a9\u4e00\u56de\nstate, _ = env.reset()\ndone = False\nwhile not done:\n    action = np.argmax(Q&#91;state])\n    next_state, reward, terminated, truncated, _ = env.step(action)\n    state = next_state\n    done = terminated or truncated\n    env.render()\nprint(\"Episode reward:\", reward)<\/code><\/pre>\n            <\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u5982\u679c\u4f60\u8981\u6539\u6210\u81ea\u5df1\u7684\u96e2\u6563\u74b0\u5883\uff0c\u53ea\u8981\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u628a\u00a0<code>env = gym.make(...)<\/code>\u00a0\u63db\u6210\u4f60\u7684\u74b0\u5883\u3002<\/li>\n\n\n\n<li>\u78ba\u4fdd\u6709\u00a0<code>observation_space.n<\/code>\u00a0\/\u00a0<code>action_space.n<\/code>\uff08\u6216\u81ea\u5df1\u5b9a\u7fa9\u96e2\u6563\u7de8\u78bc\uff09\uff0c\u5176\u9918\u6838\u5fc3 Q\u2011learning loop \u57fa\u672c\u53ef\u4ee5\u539f\u6a23\u6cbf\u7528\u3002<\/li>\n<\/ul>\n","protected":false},"excerpt":{"rendered":"<p>Q\u2011Learning 
\u662f\u4e00\u7a2e\u300c\u6a21\u578b\u7121\u95dc\u300d\uff08model\u2011free\uff09\u7684\u5f37\u5316\u5b78\u7fd2\u6f14\u7b97\u6cd5\uff0c\u7528\u4f86\u5728\u4e0d\u77e5\u9053\u74b0\u5883\u8f49\u79fb\u6a5f\u7387\u7684\u60c5\u6cc1\u4e0b\uff0c\u900f\u904e\u53cd\u8986\u8a66\u8aa4\u5b78\u51fa\u5728\u6bcf\u500b\u72c0\u614b\u4e0b\u61c9\u8a72\u63a1\u53d6\u54ea\u500b\u52d5\u4f5c\u53ef\u4ee5\u8b93\u9577\u671f\u7d2f\u7a4d\u56de\u5831\u6700\u5927\u5316\u3002 \u6838\u5fc3\u6982\u5ff5 \u66f4\u65b0\u898f\u5247\uff08Bellman \u65b9\u7a0b\uff09 \u6bcf\u6b21\u4e92\u52d5\u4e00\u6b65&nbsp;s,a,r,s\u2032&nbsp;\u5f8c\uff0cQ\u2011Learning \u7528 Bellman optimality \u7684 TD \u66f4\u65b0\uff1a \u7279\u9ede\u8207\u61c9\u7528\u76f4\u89ba \u5982\u679c\u4f60\u60f3\uff0c\u6211\u53ef\u4ee5\u4e0b\u4e00\u6b65\u76f4\u63a5\u7d66\u4f60\u4e00\u500b\u8d85\u77ed\u7684 Python \u5be6\u4f5c\u7bc4\u4f8b\uff08tabular Q\u2011learning\uff09\u7576\u4f5c template\u3002 \u7531\u6df1\u5ea6\u7814\u7a76\u6e96\u5099 \u4e00\u500b\u8d85\u77ed\u7684 Python \u5be6\u4f5c\u7bc4\u4f8b\uff08tabular Q\u2011learning\uff09\u7576\u4f5c template\u3002 \u4e0b\u9762\u662f\u4e00\u500b\u300c\u6700\u5c0f\u53ef\u7528\u300d\u7684 tabular Q\u2011learning Template\uff0c\u7528 Gymnasium \u7684 FrozenLake\uff08\u96e2\u6563\u72c0\u614b\uff0f\u52d5\u4f5c\uff09\u793a\u7bc4\uff0c\u4f60\u53ef\u4ee5\u76f4\u63a5\u6539\u6210\u4efb\u4f55\u96e2\u6563\u74b0\u5883\u3002 
\u5982\u679c\u4f60\u8981\u6539\u6210\u81ea\u5df1\u7684\u96e2\u6563\u74b0\u5883\uff0c\u53ea\u8981\uff1a<\/p>\n","protected":false},"author":1,"featured_media":0,"parent":0,"menu_order":0,"comment_status":"closed","ping_status":"closed","template":"","meta":{"ai_generated_summary":"","footnotes":""},"class_list":["post-8025","page","type-page","status-publish","hentry"],"_links":{"self":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/pages\/8025","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/pages"}],"about":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/types\/page"}],"author":[{"embeddable":true,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/comments?post=8025"}],"version-history":[{"count":0,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/pages\/8025\/revisions"}],"wp:attachment":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/media?parent=8025"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}