{"id":4451,"date":"2023-07-30T22:11:27","date_gmt":"2023-07-30T14:11:27","guid":{"rendered":"\/?p=4451"},"modified":"2023-07-31T20:14:18","modified_gmt":"2023-07-31T12:14:18","slug":"15-spark%e4%b8%ad%e7%9a%84join%e7%ae%97%e5%ad%90","status":"publish","type":"post","link":"http:\/\/xinblog.ltd\/?p=4451","title":{"rendered":"15.Spark\u4e2d\u7684join\u7b97\u5b50"},"content":{"rendered":"<p>\u6211\u4eec\u5b66\u4e60\u4e86Spark SQL\u4e2d\u7684\u8bf8\u591a\u7b97\u5b50,\u4f46\u5728\u8fd9\u4e00\u8bb2\u4e2d\uff0c\u6211\u4eec\u5c06\u805a\u7126\u4e8e\u5176\u4e2d\u5173\u4e8ejoin\u7684\u652f\u6301<\/p>\n<p>Join\u7684\u79cd\u7c7b\u5f88\u4e30\u5bcc\uff0c\u4ece\u79cd\u7c7b\u6765\u5212\u5206\u57fa\u672c\u53ef\u4ee5\u5206\u4e3a\u5185\u5173\u8054\uff0c\u5916\u5173\u8054\uff0c\u5de6\u5173\u8054\uff0c\u53f3\u5173\u8054<\/p>\n<p>\u4ece\u5b9e\u73b0\u673a\u5236\u7684\u89d2\u5ea6\uff0cJoin\u53ef\u4ee5\u5206\u4e3aNLJ,SMJ \u548c HJ<\/p>\n<p>Nested Loop join, Sort Merge Join,Hash Join<\/p>\n<p>\u9996\u5148\u6211\u4eec\u51c6\u5907\u4e00\u4e9b\u6570\u636e<\/p>\n<table>\n<tr>\n<td>\n  import spark.implicits._<\/p>\n<p>import org.apache.spark.sql.DataFrame<\/p>\n<p>\/\/ \u521b\u5efa\u5458\u5de5\u4fe1\u606f\u8868<\/p>\n<p>val seq = Seq((1, &#8220;Mike&#8221;, 28,   &#8220;Male&#8221;), (2, &#8220;Lily&#8221;, 30, &#8220;Female&#8221;), (3,   &#8220;Raymond&#8221;,<\/p>\n<p>val employees: DataFrame =   seq.toDF(&#8220;id&#8221;, &#8220;name&#8221;, &#8220;age&#8221;,   &#8220;gender&#8221;)<\/p>\n<p>\/\/ \u521b\u5efa\u85aa\u8d44\u8868<\/p>\n<p>val seq2 = Seq((1, 26000), (2, 30000),   (4, 25000), (3, 20000))<\/p>\n<p>val salaries:DataFrame =   seq2.toDF(&#8220;id&#8221;, &#8220;salary&#8221;)<\/td>\n<\/tr>\n<\/table>\n<p>\u4e0a\u9762\u4ee3\u7801\u4e2d\uff0c\u6211\u4eec\u521b\u5efa\u4e86\u4e24\u4e2aDataFrame\uff0c\u4e00\u4e2a\u7528\u4e8e\u5b58\u50a8\u5458\u5de5\u4fe1\u606f\uff0c\u53e6\u4e00\u4e2a\u5b58\u50a8\u5458\u5de5\u85aa\u8d44<\/p>\n<p>\u6211\u4eec\u53ef\u4ee5\u901a\u8fc7\u5173\u8054\u6761\u4ef6id\u8fdb\u884c\u62fc\u63a5<\/p>\n<p>\u8fd9\u91cc\u6211\u4eec\u901a\u8fc7\u8fd9\u4e24\u4e2a\u8868\u5c55\u793a\u4e0d\u540c\u7684\u5173\u8054\u5f62\u5f0f<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"617\" height=\"354\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-43.png\" class=\"wp-image-4452\" alt=\"\u56fe\u7247\" srcset=\"http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-43.png 617w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-43-300x172.png 300w\" sizes=\"(max-width: 617px) 100vw, 617px\" \/><\/p>\n<p>\u9996\u5148\u662f\u5185\u5173\u8054\uff0c\u6211\u4eec\u5982\u679c\u60f3\u8981\u83b7\u53d6\u6bcf\u4e2a\u4eba\u7684\u85aa\u8d44\u60c5\u51b5\uff0c\u5e76\u4e14\u6dd8\u6c70\u4e24\u4e2a\u8868\u4e2d\u7684\u4e0d\u5b8c\u6574\u6570\u636e\uff0c\u5c31\u53ef\u4ee5\u4f7f\u7528\u5185\u5173\u8054\uff0cinner<\/p>\n<p>val jointDF: DataFrame = salaries.join(employees, salaries(&#8220;id&#8221;) === employees(&#8220;id&#8221;),\u00a0 &#8220;inner\u201d)<\/p>\n<p>\u901a\u8fc7\u8fd9\u79cd\u65b9\u5f0f\uff0c\u6211\u4eec\u5f97\u5230\u7684\u7ed3\u679c\u4e3a<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"265\" height=\"127\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-44.png\" class=\"wp-image-4453\" alt=\"\u56fe\u7247\" \/><\/p>\n<p>\u53ef\u4ee5\u770b\u51fa\uff0c\u4ed6\u5206\u522b\u6dd8\u6c70\u4e86id\u4e3a4\uff0c\u4f46\u6ca1\u6709\u5458\u5de5\u4fe1\u606f\u7684\u6570\u636e\u548cid\u4e3a5\uff0c\u6ca1\u6709\u85aa\u8d44\u4fe1\u606f\u7684\u6570\u636e<\/p>\n<p>\u8fd9\u662f\u56e0\u4e3a\u5185\u5173\u8054\u5c31\u662f\u53ea\u4fdd\u5b58\u5de6\u53f3\u8868\u4e2d\u6ee1\u8db3\u5173\u8054\u6761\u4ef6\u7684\u6570\u636e\uff0c\u7531\u4e8e\u53ea\u67091 2 3 \u5206\u522b\u6709\u6570\u636e\uff0c\u6240\u4ee5\u7ed3\u679c\u4e2d\u4e5f\u53ea\u6709 1 2 3<\/p>\n<p>\u5176\u6b21\u662f\u5916\u5173\u8054<\/p>\n<p>\u5206\u522b\u662f\u5de6\u5916\u5173\u8054\uff0c\u53f3\u5916\u5173\u8054\uff0c\u5168\u5916\u5173\u8054<\/p>\n<p>\u5bf9\u4e8e\u5de6\u5916\u5173\u8054<\/p>\n<p>\u6211\u4eec\u76f4\u63a5\u770b\u4e0b\u7ed3\u679c<\/p>\n<p>val jointDF: DataFrame = salaries.join(employees, salaries(&#8220;id&#8221;) === employees(&#8220;id&#8221;),\u00a0 &#8220;left\u201d)<\/p>\n<p>\u5f97\u5230\u7684\u7ed3\u679c\u5982\u4e0b<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"279\" height=\"142\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-45.png\" class=\"wp-image-4454\" alt=\"\u56fe\u7247\" \/><\/p>\n<p>\u5de6\u5916\u5173\u8054\u7684\u7ed3\u679c\u96c6\uff0c\u5c31\u662f\u5de6\u8868\u4e3a\u57fa\u51c6\u8fdb\u884c\u67e5\u8be2\uff0c\u5982\u679c\u5339\u914d\u4e0d\u5230\u53f3\u8868\u6570\u636e\uff0c\u5219\u8bbe\u4e3anull<\/p>\n<p>\u540c\u7406\uff0c\u53f3\u5916\u5173\u8054\u5219\u662f\u6b63\u597d\u5012\u8fc7\u6765<\/p>\n<p>\u5c06\u53f3\u8868\u4e2d\u5339\u914d\u4e0d\u5230\u7684\u8bbe\u7f6e\u4e3anull<\/p>\n<p>\u90a3\u4e48\u5168\u5916\u5173\u8054\u7684\u4f5c\u7528\u5c31\u5f88\u660e\u663e\u4e86<\/p>\n<p>\u5229\u7528full\u6216\u8005outer\u5173\u952e\u8bcd\u5c31\u53ef\u4ee5\u4f7f\u7528\u5916\u5173\u8054<\/p>\n<p>\u5f97\u5230\u7684\u7ed3\u679c\u96c6\u4e3a<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"296\" height=\"160\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-46.png\" class=\"wp-image-4455\" alt=\"\u56fe\u7247\" \/><\/p>\n<p>\u5c31\u662f\u5373\u4f7f\u4e0d\u5339\u914d\u4e0a\u4e5f\u5b58\u5728\u4e8e\u7ed3\u679c\u96c6\u4e2d\uff0c\u53ea\u4e0d\u8fc7\u8bbe\u7f6e\u4e3anull<\/p>\n<p>\u90a3\u4e48\u6709\u4e86\u8fd9\u4e9b\u5916\u5173\u8054\u4e4b\u540e\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u4e0b\u5173\u4e8e\u5de6\u534a\u5173\u8054\uff0c\u4ee5\u53ca\u5de6\u9006\u5173\u8054<\/p>\n<p>Left Semi Join\/Left Anti Join<\/p>\n<p>\u5b9e\u9645\u4e0a\uff0c\u5de6\u534a\u5173\u8054\u7684\u5c31\u662f\u4fdd\u7559\u4e86\uff0c\u5de6\u8868\u4e2d\u7b26\u5408\u5173\u8054\u6761\u4ef6\u7684\u6570\u636e\u8bb0\u5f55\uff0c\u5982\u4e0b\u6240\u793a<\/p>\n<p>val jointDF: DataFrame = salaries.join(employees, salaries(&#8220;id&#8221;) === employees(&#8220;id&#8221;),\u00a0 &#8220;left_semi\u201d)<\/p>\n<p>\u5f97\u5230\u7684\u6570\u636e\u96c6\u5982\u4e0b<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"127\" height=\"165\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-47.png\" class=\"wp-image-4456\" alt=\"\u56fe\u7247\" \/><\/p>\n<p>\u5de6\u9006\u5173\u8054\u5219\u662f\u6307\u4fdd\u7559\u4e0d\u7b26\u5408\u5173\u8054\u5173\u7cfb\u7684\u6570\u636e<\/p>\n<p>\u6700\u540e\u662f\u4ece\u5173\u8054\u673a\u5236\u4e0a\u8fdb\u884c\u8bb2\u8ff0\uff0c\u4e00\u5f00\u59cb\u6211\u4eec\u4e5f\u8bf4\u4e86\u6709\u4e09\u79cd\u5b9e\u73b0\u673a\u5236\uff0c\u5206\u522b\u662fNLJ,SMJ\u4ee5\u53caHJ<\/p>\n<p>\u6211\u4eec\u4ee5\u5185\u5173\u8054\u4e3a\u4f8b\uff0c\u6211\u4eec\u8fd8\u662f\u62ff\u4e0a\u9762\u7684\u6570\u636e\u8fdb\u884c\u8bb2\u89e3<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"258\" height=\"167\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-48.png\" class=\"wp-image-4457\" alt=\"\u56fe\u7247\" \/><\/p>\n<p>\u9996\u5148\u662fNLJ\uff0cNested Loop Join\uff0c\u6211\u4eec\u901a\u5e38\u4f1a\u5c06join\u6d89\u53ca\u7684\u4e24\u4e2a\u8868\u79f0\u4e3a\u9a71\u52a8\u8868\u548c\u88ab\u9a71\u52a8\u8868<\/p>\n<p>\u5b9e\u9645\u7684\u5b9e\u73b0\u5c31\u662f\uff0c\u4e24\u5c42for\u5faa\u73af\uff0c\u5728\u5916\u5c42\u5faa\u73af\u9a71\u52a8\u8868\u6570\u636e\uff0c\u5185\u5c42\u5219\u53bb\u5224\u65ad\u88ab\u9a71\u52a8\u8868\u662f\u5426\u6709\u5339\u914d\u6570\u636e<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"548\" height=\"158\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-49.png\" class=\"wp-image-4458\" alt=\"\u56fe\u7247\" srcset=\"http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-49.png 548w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-49-300x86.png 300w\" sizes=\"(max-width: 548px) 100vw, 548px\" \/><\/p>\n<p>\u8fd9\u6837\u7684\u4e00\u4e2aNLJ\u7684\u8d1f\u8d23\u5ea6\u5c31\u662fO(M*N)<\/p>\n<p>\u5176\u6b21\u662fSMJ, Sort Merge Join<\/p>\n<p>\u8fdb\u884c\u7ebf\u6392\u5e8f\uff0c\u518d\u5f52\u5e76\u3002\u7ed9\u5b9a\u53c2\u4e0e\u5173\u8054\u7684\u4e24\u4e2a\u8868\uff0cSMJ\u5219\u5148\u628a\u4ed6\u4eec\u8fdb\u884c\u6392\u5e8f\uff0c\u7136\u540e\u4f7f\u7528\u72ec\u7acb\u7684\u6e38\u6807\uff0c\u7136\u540e\u5c06\u4e24\u4e2a\u6392\u597d\u5e8f\u7684\u8868\u505a\u5f52\u5e76\u5173\u8054<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"585\" height=\"329\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-50.png\" class=\"wp-image-4459\" alt=\"\u56fe\u7247\" srcset=\"http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-50.png 585w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-50-300x169.png 300w\" sizes=\"(max-width: 585px) 100vw, 585px\" \/><\/p>\n<p>\u5177\u4f53\u6d41\u7a0b\u4e3a\uff0c\u5206\u522b\u5728\u4e24\u4e2a\u8868\u4e0a\u6709\u4e00\u4e2a\u4e0b\u6807<\/p>\n<p>\u7136\u540e\u8fdb\u884c\u5339\u914d\uff0c\u5982\u679c\u4e24\u4e2aid\u503c\u76f8\u540c\uff0c\u8fdb\u884c\u62fc\u63a5\uff0c\u5e76\u79fb\u52a8\u4e24\u8868\u7684\u4e0b\u6807<\/p>\n<p>\u5982\u679cid\u4e0d\u540c\uff0c\u5219\u79fb\u52a8id\u5c0f\u7684\u8868\u7684\u4e0b\u6807<\/p>\n<p>\u57fa\u4e8e\u8fd9\u4e2a\u6982\u5ff5\uff0c\u5f53\u67d0\u4e2a\u8868\u7684\u4e0b\u6807\u5230\u4e86\u5c3d\u5934\uff0c\u5c31\u5b8c\u6210\u5173\u8054\u4e86<\/p>\n<p>\u8fd9\u6837\u7684\u590d\u6742\u5ea6\u662fO(M+N),\u867d\u7136\u5f88\u4f4e\uff0c\u4f46\u662f\u4f9d\u8d56\u4e8e\u4e24\u4e2a\u8868\u4e8b\u5148\u6392\u597d\u5e8f\uff0c\u8fd9\u4e00\u70b9\u7684\u8981\u6c42\u5c31\u5f88\u9ad8<\/p>\n<p>\u6700\u540e\u662fHash Join<\/p>\n<p>\u5229\u7528\u4e00\u4e2aHash\u5b57\u6bb5\u8fdb\u884c\u62fc\u63a5<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"865\" height=\"225\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-51.png\" class=\"wp-image-4460\" alt=\"\u56fe\u7247\" srcset=\"http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-51.png 865w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-51-300x78.png 300w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-51-768x200.png 768w\" sizes=\"(max-width: 865px) 100vw, 865px\" \/><\/p>\n<p>HJ\u7684\u8ba1\u7b97\u9636\u6bb5\u5206\u4e3a\u4e86Build\u548cProbe\uff0cBuild\u9636\u6bb5\u5728\u88ab\u9a71\u52a8\u8868\u4e0a\u6784\u5efaHash\u5b57\u6bb5\uff0c\u7136\u540e\u5728Probe\u9636\u6bb5\uff0c\u5219\u5728\u9a71\u52a8\u8868\u4e0a\u904d\u5386\uff0c\u8ba1\u7b97Hash\u503c\uff0c\u7136\u540e\u53bb\u88ab\u9a71\u52a8\u8868\u4e0a\u67e5\u8be2\uff0c\u5f53\u4e00\u81f4\u7684\u65f6\u5019\u5219\u8fdb\u884c\u6bd4\u5bf9\u5b9e\u9645\u503c\uff0c\u5b9e\u9645\u503c\u4e00\u81f4\u5219\u62fc\u63a5\u540e\u8f93\u51fa<\/p>\n<p>\u90a3\u4e48\u5230\u6b64\u4e3a\u6b62\uff0c\u6211\u4eec\u8bf4\u4e86\u4e09\u79cdJoin\u7684\u5b9e\u73b0\u673a\u5236\uff0c\u6211\u4eec\u63a5\u4e0b\u6765\u8bf4\u4e0b\uff0c\u5982\u4f55\u5229\u7528\u8fd9\u4e9b\u673a\u5236\u8fdb\u884c\u6570\u636e\u5173\u8054<\/p>\n<p>\u4e0d\u8fc7\u5230\u6b64\u6211\u4eec\u5148\u603b\u7ed3\u4e0b\uff0c\u6211\u4eec\u8bf4\u4e0bSpark SQL\u4e2d\u652f\u6301\u7684Join\u7b97\u5b50\uff0c\u4e0d\u540c\u7b97\u5b50\u5173\u8054\u5f62\u5f0f\u4e0d\u540c\uff0c\u603b\u7ed3\u5982\u4e0b\u8868<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"865\" height=\"342\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-52.png\" class=\"wp-image-4461\" alt=\"\u56fe\u7247\" srcset=\"http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-52.png 865w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-52-300x119.png 300w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-52-768x304.png 768w\" sizes=\"(max-width: 865px) 100vw, 865px\" \/><\/p>\n<p>\u5176\u6b21\u53c8\u8bf4\u660e\u4e86Join\u7684\u4e09\u79cd\u5b9e\u73b0\u673a\u5236\uff0c\u5206\u522b\u662fNested Loop Join\uff0cSort Merge Join\u548cHash Join<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"865\" height=\"239\" src=\"\/wp-content\/uploads\/2023\/07\/unnamed-file-53.png\" class=\"wp-image-4462\" alt=\"\u56fe\u7247\" srcset=\"http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-53.png 865w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-53-300x83.png 300w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2023\/07\/unnamed-file-53-768x212.png 768w\" sizes=\"(max-width: 865px) 100vw, 865px\" \/><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u6211\u4eec\u5b66\u4e60\u4e86Spark SQL\u4e2d [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[36],"tags":[],"_links":{"self":[{"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/posts\/4451"}],"collection":[{"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/xinblog.ltd\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=4451"}],"version-history":[{"count":0,"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/posts\/4451\/revisions"}],"wp:attachment":[{"href":"http:\/\/xinblog.ltd\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=4451"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/xinblog.ltd\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=4451"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/xinblog.ltd\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=4451"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}