{"id":11553,"date":"2022-09-25T18:32:49","date_gmt":"2022-09-25T09:32:49","guid":{"rendered":"https:\/\/prodskill.com\/?p=11553"},"modified":"2022-10-10T20:58:28","modified_gmt":"2022-10-10T11:58:28","slug":"word-extractor-source-code-2","status":"publish","type":"post","link":"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/","title":{"rendered":"\u5206\u8bcd\u5de5\u5177\uff08\u4e94\uff09\uff1a\u5206\u8bcd\u5de5\u5177\u6e90\u7801\u8bf4\u660e\uff08\u4e8c\uff09"},"content":{"rendered":"\n<p>\uc774\uc804 \uae00\uc5d0 \uc774\uc5b4 Python\uc73c\ub85c \uad6c\ud604\ud55c \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc5d0 \ub300\ud574 \uc0b4\ud3b4\ubcf8\ub2e4.<\/p>\n\n\n\n<p>\uc774\uc804&nbsp;\uae00\uc5d0\uc11c&nbsp;\uc774\uc5b4\uc9c0\ub294&nbsp;\ub0b4\uc6a9\uc774\ub2e4.<\/p>\n\n\n\n<p><a href=\"https:\/\/prodskill.com\/word-extractor-source-code-1\/\">\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(4): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc124\uba85(1)<\/a><\/p>\n\n\n\n<div id=\"ez-toc-container\" class=\"ez-toc-v2_0_82_2 counter-hierarchy ez-toc-counter ez-toc-grey ez-toc-container-direction\">\n<div class=\"ez-toc-title-container\">\n<p class=\"ez-toc-title\" style=\"cursor:inherit\">&lt;&lt;\ubaa9\ucc28&gt;&gt;<\/p>\n<span class=\"ez-toc-title-toggle\"><a href=\"#\" class=\"ez-toc-pull-right ez-toc-btn ez-toc-btn-xs ez-toc-btn-default ez-toc-toggle\" aria-label=\"Toggle Table of Content\"><span class=\"ez-toc-js-icon-con\"><span class=\"\"><span class=\"eztoc-hide\" style=\"display:none;\">Toggle<\/span><span class=\"ez-toc-icon-toggle-span\"><svg style=\"fill: #999;color:#999\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" class=\"list-377408\" width=\"20px\" height=\"20px\" viewBox=\"0 0 24 24\" fill=\"none\"><path d=\"M6 6H4v2h2V6zm14 0H8v2h12V6zM4 11h2v2H4v-2zm16 0H8v2h12v-2zM4 16h2v2H4v-2zm16 0H8v2h12v-2z\" fill=\"currentColor\"><\/path><\/svg><svg style=\"fill: #999;color:#999\" class=\"arrow-unsorted-368013\" 
xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"10px\" height=\"10px\" viewBox=\"0 0 24 24\" version=\"1.2\" baseProfile=\"tiny\"><path d=\"M18.2 9.3l-6.2-6.3-6.2 6.3c-.2.2-.3.4-.3.7s.1.5.3.7c.2.2.4.3.7.3h11c.3 0 .5-.1.7-.3.2-.2.3-.5.3-.7s-.1-.5-.3-.7zM5.8 14.7l6.2 6.3 6.2-6.3c.2-.2.3-.5.3-.7s-.1-.5-.3-.7c-.2-.2-.4-.3-.7-.3h-11c-.3 0-.5.1-.7.3-.2.2-.3.5-.3.7s.1.5.3.7z\"\/><\/svg><\/span><\/span><\/span><\/a><\/span><\/div>\n<nav><ul class='ez-toc-list ez-toc-list-level-1 ' ><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-1\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/#4_%EB%8B%A8%EC%96%B4_%EC%B6%94%EC%B6%9C_%EB%8F%84%EA%B5%AC_%EC%86%8C%EC%8A%A4%EC%BD%94%EB%93%9C\" >4. \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc<\/a><ul class='ez-toc-list-level-3' ><li class='ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-2\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/#43_get_file_text_%ED%95%A8%EC%88%98\" >4.3. get_file_text \ud568\uc218<\/a><ul class='ez-toc-list-level-4' ><li class='ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-3\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/#431_get_doc_text_%ED%95%A8%EC%88%98\" >4.3.1. get_doc_text&nbsp;\ud568\uc218<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-4\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/#432_get_ppt_text_%ED%95%A8%EC%88%98\" >4.3.2. get_ppt_text&nbsp;\ud568\uc218<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-5\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/#433_get_txt_text_%ED%95%A8%EC%88%98\" >4.3.3. 
get_txt_text&nbsp;\ud568\uc218<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-6\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/#434_get_db_comment_text_%ED%95%A8%EC%88%98\" >4.3.4. get_db_comment_text&nbsp;\ud568\uc218<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-7\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/#435_get_hwp_text_%ED%95%A8%EC%88%98\" >4.3.5. get_hwp_text&nbsp;\ud568\uc218<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-4'><a class=\"ez-toc-link ez-toc-heading-8\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/#436_get_pdf_text_%ED%95%A8%EC%88%98\" >4.3.6. get_pdf_text&nbsp;\ud568\uc218<\/a><\/li><\/ul><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-9\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/#44_get_word_list_%ED%95%A8%EC%88%98\" >4.4. get_word_list \ud568\uc218<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-3'><a class=\"ez-toc-link ez-toc-heading-10\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/#45_make_word_cloud_%ED%95%A8%EC%88%98\" >4.5. make_word_cloud \ud568\uc218<\/a><\/li><\/ul><\/li><\/ul><\/nav><\/div>\n<h2 class=\"wp-block-heading\" id=\"4._\ub2e8\uc5b4_\ucd94\ucd9c_\ub3c4\uad6c_\uc18c\uc2a4\ucf54\ub4dc\"><span class=\"ez-toc-section\" id=\"4_%EB%8B%A8%EC%96%B4_%EC%B6%94%EC%B6%9C_%EB%8F%84%EA%B5%AC_%EC%86%8C%EC%8A%A4%EC%BD%94%EB%93%9C\"><\/span>4. \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"4.3._get_file_text_\ud568\uc218\"><span class=\"ez-toc-section\" id=\"43_get_file_text_%ED%95%A8%EC%88%98\"><\/span>4.3. 
get_file_text \ud568\uc218<span class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"350\" data-enlighter-title=\"\" data-enlighter-group=\"\">def get_file_text(file_name) -> DataFrame:\n    \"\"\"\n    MS Word, PowerPoint, Text, DB Comment(Excel) file\uc5d0\uc11c text\ub97c \ucd94\ucd9c\ud558\ub294 \ud568\uc218\n    :param file_name: \ud30c\uc77c\uba85\n    :return: file\uc5d0\uc11c \ucd94\ucd9c\ud55c text(DataFrame type)\n    \"\"\"\n    df_text = DataFrame()\n    if file_name.endswith(('.doc', '.docx')):\n        df_text = get_doc_text(file_name)\n    elif file_name.endswith(('.ppt', '.pptx')):\n        df_text = get_ppt_text(file_name)\n    elif file_name.endswith('.txt'):\n        df_text = get_txt_text(file_name)\n    elif file_name.endswith(('.xls', '.xlsx', '.xlsb')):\n        df_text = get_db_comment_text(file_name)\n    return df_text<\/pre>\n\n\n\n<ul class=\"wp-block-list\"><li>357~365\ud589: \ud30c\uc77c \ud655\uc7a5\uc790\uc5d0 \ub530\ub77c \uc801\ud569\ud55c \ud568\uc218\ub97c \uc2e4\ud589\ud558\uace0 \uadf8 \uacb0\uacfc\ub97c df_text\uc5d0 \ub2f4\uc544 \ubc18\ud658\ud55c\ub2e4.<\/li><\/ul>\n\n\n\n<p>\ud30c\uc77c \ud655\uc7a5\uc790\uc5d0 \ub530\ub77c \uc2e4\ud589\ud558\ub294 \uac01 \ud568\uc218\uc5d0 \ub300\ud55c \uc124\uba85\uc740 \ub2e4\uc74c\uacfc \uac19\ub2e4.<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"4.3.1._get_doc_text_\ud568\uc218\"><span class=\"ez-toc-section\" id=\"431_get_doc_text_%ED%95%A8%EC%88%98\"><\/span>4.3.1. 
get_doc_text&nbsp;\ud568\uc218<span class=\"ez-toc-section-end\"><\/span><\/h4>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"193,194,198-204\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"184\" data-enlighter-title=\"\" data-enlighter-group=\"\">def get_doc_text(file_name) -> DataFrame:\n    \"\"\"\n    doc \ud30c\uc77c\uc5d0\uc11c text\ub97c \ucd94\ucd9c\ud558\uc5ec DataFrame type\uc73c\ub85c return\n    :param file_name: \uc785\ub825 \ud30c\uc77c\uba85 (str type)\n    :return: \uc785\ub825 \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c text\n    \"\"\"\n    # :return: \uc785\ub825 \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c text\uc5d0 \ud615\ud0dc\uc18c \ubd84\uc11d\uae30\ub85c \uba85\uc0ac \ucd94\ucd9c\ud55c DataFrame\n    start_time = time.time()\n    print('\\r\\nget_doc_text: %s' % file_name)\n    word_app = win32com.client.Dispatch(\"Word.Application\")\n    word_file = word_app.Documents.Open(file_name, True)\n    # result = []\n    df_text = pd.DataFrame()\n    page = 0\n    for paragraph in word_file.Paragraphs:\n        text = paragraph.Range.Text\n        page = paragraph.Range.Information(3)  # 3: wdActiveEndPageNumber(Text\uc758 \ud398\uc774\uc9c0\ubc88\ud638 \ud655\uc778)\n        if text.strip() != '':\n            sr_text = Series([file_name, 'doc', page, text, f'{file_name}:{page}:{text}'],\n                             index=['FileName', 'FileType', 'Page', 'Text', 'Source'])\n            df_text = df_text.append(sr_text, ignore_index=True)\n\n    word_file.Close()\n    print('text count: %s' % str(df_text.shape[0]))\n    print('page count: %d' % page)\n    end_time = time.time()\n    # elapsed_time = end_time - start_time\n    elapsed_time = str(datetime.timedelta(seconds=end_time - start_time))\n    print('[pid:%d] get_doc_text elapsed time: %s' % (os.getpid(), elapsed_time))\n    # return get_word_list(df_text)\n    return df_text<\/pre>\n\n\n\n<ul 
class=\"wp-block-list\"><li>193\ud589: win32com package\ub85c MS Word \ud504\ub85c\uadf8\ub7a8\uc758 instance\ub97c \uc0dd\uc131\ud55c\ub2e4. MS Word\uac00 \uc2e4\ud589\ub418\uc9c0 \uc54a\uc740 \uc0c1\ud0dc\ub77c\uba74 \uc774 \ucf54\ub4dc\ub85c \uc2e4\ud589\ub41c\ub2e4.<\/li><li>194\ud589: \uc704\uc5d0\uc11c \uc0dd\uc131\ud55c MS Word \ud504\ub85c\uadf8\ub7a8 instance\uc5d0\uc11c .doc \ub610\ub294 .docx \ud30c\uc77c\uc744 \uc5f0\ub2e4.<\/li><li>198\ud589: \ud30c\uc77c \ub0b4\uc6a9\uc758 \ub2e8\ub77d(Paragraph)\uc744 \uc21c\ud68c\ud55c\ub2e4.<\/li><li>199\ud589: \ub2e8\ub77d\uc758 \ub0b4\uc6a9\uc911 \ud14d\uc2a4\ud2b8\ub97c \ucd94\ucd9c\ud55c\ub2e4.<\/li><li>200\ud589: \ud604\uc7ac \ub2e8\ub77d\uc758 \ud398\uc774\uc9c0 \ubc88\ud638\ub97c \ucd94\ucd9c\ud55c\ub2e4. Range.Information(3)\uc5d0\uc11c &#8220;3&#8221;\uc740 wdActiveEndPageNumber \uc0c1\uc218\uac12\uc5d0 \ud574\ub2f9\ud55c\ub2e4. \uc790\uc138\ud55c \ub0b4\uc6a9\uc740&nbsp;<a href=\"https:\/\/docs.microsoft.com\/en-us\/office\/vba\/api\/word.wdinformation\" target=\"_blank\" rel=\"noreferrer noopener\">WdInformation enumeration (Word) | Microsoft Docs<\/a>\ub97c \ucc38\uc870\ud55c\ub2e4.<\/li><li>202~204\ud589: \ucd94\ucd9c\ud55c \ud14d\uc2a4\ud2b8\ub97c Series \uac1c\uccb4\ub85c \ub9cc\ub4e4\uace0, df_text DataFrame\uc758 \ud589\uc5d0 \ucd94\uac00\ud55c\ub2e4.<\/li><li>214\ud589: \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c \ud14d\uc2a4\ud2b8\uac00 \ub2f4\uaca8\uc788\ub294 df_text\ub97c \ubc18\ud658\ud55c\ub2e4.<\/li><\/ul>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"4.3.2._get_ppt_text_\ud568\uc218\"><span class=\"ez-toc-section\" id=\"432_get_ppt_text_%ED%95%A8%EC%88%98\"><\/span>4.3.2. 
get_ppt_text&nbsp;\ud568\uc218<span class=\"ez-toc-section-end\"><\/span><\/h4>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"138-139,143,149,168,163-167\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"129\" data-enlighter-title=\"\" data-enlighter-group=\"\">def get_ppt_text(file_name) -> DataFrame:\n    \"\"\"\n    ppt \ud30c\uc77c\uc5d0\uc11c text\ub97c \ucd94\ucd9c\ud558\uc5ec DataFrame type\uc73c\ub85c return\n    :param file_name: \uc785\ub825 \ud30c\uc77c\uba85 (str type)\n    :return: \uc785\ub825 \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c text\n    \"\"\"\n    # :return: \uc785\ub825 \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c text\uc5d0 \ud615\ud0dc\uc18c \ubd84\uc11d\uae30\ub85c \uba85\uc0ac \ucd94\ucd9c\ud55c DataFrame\n    start_time = time.time()\n    print('\\r\\nget_ppt_text: %s' % file_name)\n    ppt_app = win32com.client.Dispatch('PowerPoint.Application')\n    ppt_file = ppt_app.Presentations.Open(file_name, True)\n    # result = []\n    df_text = pd.DataFrame()\n    page_count = 0\n    for slide in ppt_file.Slides:\n        slide_number = slide.SlideNumber\n        page_count += 1\n        for shape in slide.Shapes:\n            shape_text = []\n            text = ''\n            if shape.HasTable:\n                col_cnt = shape.Table.Columns.Count\n                row_cnt = shape.Table.Rows.Count\n                for row_idx in range(1, row_cnt + 1):\n                    for col_idx in range(1, col_cnt + 1):\n                        text = shape.Table.Cell(row_idx, col_idx).Shape.TextFrame.TextRange.Text\n                        if text != '':\n                            text = text.replace('\\r', ' ')\n                            shape_text.append(text)\n            elif shape.HasTextFrame:\n                for paragraph in shape.TextFrame.TextRange.Paragraphs():\n                    text = paragraph.Text\n                    if text != '':\n        
                shape_text.append(text)\n            for text in shape_text:\n                if text.strip() != '':\n                    sr_text = Series([file_name, 'ppt', slide_number, text, f'{file_name}:{slide_number}:{text}'],\n                                     index=['FileName', 'FileType', 'Page', 'Text', 'Source'])\n                    df_text = df_text.append(sr_text, ignore_index=True)\n    # print(result)\n    ppt_file.Close()\n    # print(df_result)\n    print('text count: %s' % str(df_text.shape[0]))\n    print('page count: %d' % page_count)\n    # print(df_text.head(10))\n    # print(df_result.Paragraph)\n    # return df_result\n    end_time = time.time()\n    # elapsed_time = end_time - start_time\n    elapsed_time = str(datetime.timedelta(seconds=end_time - start_time))\n    print('[pid:%d] get_ppt_text elapsed time: %s' % (os.getpid(), elapsed_time))\n    # return get_word_list(df_text)\n    return df_text<\/pre>\n\n\n\n<ul class=\"wp-block-list\"><li>138\ud589: win32com package\ub85c MS Powerpoint \ud504\ub85c\uadf8\ub7a8\uc758 instance\ub97c \uc0dd\uc131\ud55c\ub2e4. 
MS Powerpoint\uac00 \uc2e4\ud589\ub418\uc9c0 \uc54a\uc740 \uc0c1\ud0dc\ub77c\uba74 \uc774 \ucf54\ub4dc\ub85c \uc2e4\ud589\ub41c\ub2e4.<\/li><li>139\ud589: \uc704\uc5d0\uc11c \uc0dd\uc131\ud55c MS Powerpoint \ud504\ub85c\uadf8\ub7a8 instance\uc5d0\uc11c .ppt \ub610\ub294 .pptx \ud30c\uc77c\uc744 \uc5f0\ub2e4.<\/li><li>143\ud589: \ud30c\uc77c\uc758 \uc2ac\ub77c\uc774\ub4dc(Slide)\ub97c \uc21c\ud68c\ud55c\ub2e4.<\/li><li>146\ud589: \uac01 \uc2ac\ub77c\uc774\ub4dc\uc758 \ub3c4\ud615(Shape)\uc744 \uc21c\ud68c\ud55c\ub2e4.<\/li><li>149~157\ud589: \ub3c4\ud615\uc774 \ud45c(Table)\uc778 \uacbd\uc6b0 \ud45c\uc758 \uac01 cell\uc5d0\uc11c \ud14d\uc2a4\ud2b8\ub97c \ucd94\ucd9c\ud55c\ub2e4.<\/li><li>158~162\ud589: \ub3c4\ud615\uc774 \ud45c(Table)\uac00 \uc544\ub2c8\uace0 Text\ub97c \uac00\uc9c0\uace0 \uc788\ub294 \uacbd\uc6b0 \ud14d\uc2a4\ud2b8\ub97c \ucd94\ucd9c\ud55c\ub2e4.<\/li><li>163~167\ud589: \ucd94\ucd9c\ud55c&nbsp;\ud14d\uc2a4\ud2b8\ub97c&nbsp;Series&nbsp;\uac1c\uccb4\ub85c&nbsp;\ub9cc\ub4e4\uace0,&nbsp;df_text&nbsp;DataFrame\uc758&nbsp;\ud589\uc5d0&nbsp;\ucd94\uac00\ud55c\ub2e4.<\/li><li>181\ud589: \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c \ud14d\uc2a4\ud2b8\uac00 \ub2f4\uaca8\uc788\ub294 df_text\ub97c \ubc18\ud658\ud55c\ub2e4.<\/li><\/ul>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"4.3.3._get_txt_text_\ud568\uc218\"><span class=\"ez-toc-section\" id=\"433_get_txt_text_%ED%95%A8%EC%88%98\"><\/span>4.3.3. 
get_txt_text&nbsp;\ud568\uc218<span class=\"ez-toc-section-end\"><\/span><\/h4>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"228,231-234\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"217\" data-enlighter-title=\"\" data-enlighter-group=\"\">def get_txt_text(file_name) -> DataFrame:\n    \"\"\"\n    txt \ud30c\uc77c\uc5d0\uc11c text\ub97c \ucd94\ucd9c\ud558\uc5ec DataFrame type\uc73c\ub85c return\n    :param file_name: \uc785\ub825 \ud30c\uc77c\uba85 (str type)\n    :return: \uc785\ub825 \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c text\n    \"\"\"\n    # :return: \uc785\ub825 \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c text\uc5d0 \ud615\ud0dc\uc18c \ubd84\uc11d\uae30\ub85c \uba85\uc0ac \ucd94\ucd9c\ud55c DataFrame\n    start_time = time.time()\n    print('\\r\\nget_txt_text: ' + file_name)\n    df_text = pd.DataFrame()\n    line_number = 0\n    with open(file_name, 'rt', encoding='UTF8') as file:\n        for text in file:\n            line_number += 1\n            if text.strip() != '':\n                sr_text = Series([file_name, 'txt', line_number, text, f'{file_name}:{line_number}:{text}'],\n                                 index=['FileName', 'FileType', 'Page', 'Text', 'Source'])\n                df_text = df_text.append(sr_text, ignore_index=True)\n    print('text count: %d' % df_text.shape[0])\n    print('line count: %d' % line_number)\n    end_time = time.time()\n    # elapsed_time = end_time - start_time\n    elapsed_time = str(datetime.timedelta(seconds=end_time - start_time))\n    print('[pid:%d] get_txt_text elapsed time: %s' % (os.getpid(), elapsed_time))\n    # return get_word_list(df_text)\n    return df_text<\/pre>\n\n\n\n<ul class=\"wp-block-list\"><li>228\ud589: file\uc744 UTF8 \uc778\ucf54\ub529\uc758 \uc77d\uae30 \uc804\uc6a9 \ud14d\uc2a4\ud2b8 \ubaa8\ub4dc\ub85c \uc5f0\ub2e4. 
(mode=&#8217;rt&#8217;)<\/li><li>229\ud589: file\uc758 \ud589(line)\uc744 \uc21c\ud68c\ud55c\ub2e4.<\/li><li>231~234\ud589: \ud589 \ud14d\uc2a4\ud2b8\ub97c Series \uac1c\uccb4\ub85c \ub9cc\ub4e4\uace0, df_text DataFrame\uc758 \ud589\uc5d0 \ucd94\uac00\ud55c\ub2e4.<\/li><li>242\ud589: \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c \ud14d\uc2a4\ud2b8\uac00 \ub2f4\uaca8\uc788\ub294 df_text\ub97c \ubc18\ud658\ud55c\ub2e4.<\/li><\/ul>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"4.3.4._get_db_comment_text_\ud568\uc218\"><span class=\"ez-toc-section\" id=\"434_get_db_comment_text_%ED%95%A8%EC%88%98\"><\/span>4.3.4. get_db_comment_text&nbsp;\ud568\uc218<span class=\"ez-toc-section-end\"><\/span><\/h4>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"300,302,306,309-311,323,326-328,339\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"291\" data-enlighter-title=\"\" data-enlighter-group=\"\">def get_db_comment_text(file_name) -> DataFrame:\n    \"\"\"\n    db_comment \ud30c\uc77c\uc5d0\uc11c text\ub97c \ucd94\ucd9c\ud558\uc5ec DataFrame type\uc73c\ub85c return\n    :param file_name:  \uc785\ub825 \ud30c\uc77c\uba85 (str type)\n    :return: \uc785\ub825 \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c text\n    \"\"\"\n    # :return: \uc785\ub825 \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c text\uc5d0 \ud615\ud0dc\uc18c \ubd84\uc11d\uae30\ub85c \uba85\uc0ac \ucd94\ucd9c\ud55c DataFrame\n    start_time = time.time()\n    print('\\r\\nget_db_comment_text: %s' % file_name)\n    excel_app = win32com.client.Dispatch('Excel.Application')\n    full_path_file_name = os.path.abspath(file_name)\n    excel_file = excel_app.Workbooks.Open(full_path_file_name, True)\n\n    # region Table comment\n    table_comment_sheet = excel_file.Worksheets(1)\n    last_row = table_comment_sheet.Range(\"A1\").End(-4121).Row  # -4121: xlDown\n    table_comment_range = 'A2:D%s' % (str(last_row))\n    print('table_comment_range 
: %s (%d rows)' % (table_comment_range, last_row - 1))\n    table_comments = table_comment_sheet.Range(table_comment_range).Value2\n    df_table = pd.DataFrame(list(table_comments),\n                            columns=['DB', 'Schema', 'Table', 'Text'])\n    df_table['FileName'] = full_path_file_name\n    df_table['FileType'] = 'table'\n    df_table['Page'] = 0\n    df_table = df_table[df_table.Text.notnull()]  # Text \uac12\uc774 \uc5c6\ub294 \ud589 \uc81c\uac70\n    df_table['Source'] = df_table['DB'] + '.' + df_table['Schema'] + '.' + df_table['Table'] \\\n                         + '(' + df_table['Text'].astype(str) + ')'\n    # print(df_table)\n    # endregion\n\n    # region Column comment\n    column_comment_sheet = excel_file.Worksheets(2)\n    last_row = column_comment_sheet.Range(\"A1\").End(-4121).Row  # -4121: xlDown\n    column_comment_range = 'A2:E%s' % (str(last_row))\n    print('column_comment_range : %s (%d rows)' % (column_comment_range, last_row - 1))\n    column_comments = column_comment_sheet.Range(column_comment_range).Value2\n    df_column = pd.DataFrame(list(column_comments),\n                             columns=['DB', 'Schema', 'Table', 'Column', 'Text'])\n    df_column['FileName'] = full_path_file_name\n    df_column['FileType'] = 'column'\n    df_column['Page'] = 0\n    df_column = df_column[df_column.Text.notnull()]  # Text \uac12\uc774 \uc5c6\ub294 \ud589 \uc81c\uac70\n    df_column['Source'] = df_column['DB'] + '.' + df_column['Schema'] + '.' + df_column['Table'] \\\n                          + '.' 
+ df_column['Column'] + '(' + df_column['Text'].astype(str) + ')'\n    # print(df_column)\n    # endregion\n\n    excel_file.Close()\n    df_text = df_column.append(df_table, ignore_index=True)\n    # print(df_text)\n    end_time = time.time()\n    # elapsed_time = end_time - start_time\n    elapsed_time = str(datetime.timedelta(seconds=end_time - start_time))\n    print('[pid:%d] get_db_comment_text elapsed time: %s' % (os.getpid(), elapsed_time))\n    print('text count: %s' % str(df_text.shape[0]))\n    # return get_word_list(df_text)\n    return df_text<\/pre>\n\n\n\n<ul class=\"wp-block-list\"><li>300\ud589:&nbsp;win32com package\ub85c MS Excel \ud504\ub85c\uadf8\ub7a8\uc758 instance\ub97c \uc0dd\uc131\ud55c\ub2e4. MS Excel\uc774 \uc2e4\ud589\ub418\uc9c0 \uc54a\uc740 \uc0c1\ud0dc\ub77c\uba74 \uc774 \ucf54\ub4dc\ub85c \uc2e4\ud589\ub41c\ub2e4.<\/li><li>302\ud589: \uc704\uc5d0\uc11c \uc0dd\uc131\ud55c MS Excel \ud504\ub85c\uadf8\ub7a8 instance\uc5d0\uc11c .xls \ub610\ub294 .xlsx \ud30c\uc77c\uc744 \uc5f0\ub2e4.<\/li><li>305~317\ud589: \ud14c\uc774\ube14 comment\uac00 \uc800\uc7a5\ub418\uc5b4 \uc788\ub294 \uc5d1\uc140 \ud30c\uc77c\uc758 \uccab \ubc88\uc9f8 \uc2dc\ud2b8\uc5d0\uc11c Text\ub97c \ucd94\ucd9c\ud55c\ub2e4.<\/li><li>306\ud589: \ud14c\uc774\ube14 comment \uc2dc\ud2b8\uc758 \uac00\uc7a5 \ub9c8\uc9c0\ub9c9 \ud589 \ubc88\ud638\ub97c \uad6c\ud55c\ub2e4. Range(&#8220;A1&#8221;).End(-4121).Row \uc5d0\uc11c &#8220;-4121&#8221;\uc740 &#8220;xlDown&#8221; \uc0c1\uc218\uc774\ub2e4. \uc790\uc138\ud55c \ub0b4\uc6a9\uc740&nbsp;<a href=\"https:\/\/docs.microsoft.com\/en-us\/office\/vba\/api\/excel.xldirection\" target=\"_blank\" rel=\"noreferrer noopener\">XlDirection enumeration (Excel) | Microsoft Docs<\/a>&nbsp;\ubb38\uc11c\ub97c \ucc38\uc870\ud55c\ub2e4.<\/li><li>309\ud589: \ud14c\uc774\ube14 comment \uc2dc\ud2b8\uc758 \ub0b4\uc6a9\uc744 table_comments \ubcc0\uc218\ub85c \uc77d\ub294\ub2e4. 
\uc774 \ubc29\ubc95\uc740 Loop\ub97c \uc0ac\uc6a9\ud558\uc9c0 \uc54a\uace0 Range\uc758 \ub0b4\uc6a9\uc744 \ud55c \ubc88\uc5d0 memory\ub85c \uc77d\ub294 \ubc29\ubc95\uc774\ub2e4.&nbsp;<a href=\"https:\/\/prodskill.com\/excel-vba-coding-pattern-range-loop-read\/\">VBA \ucf54\ub529 \ud328\ud134: Range Loop-\uc77d\uae30(Read)<\/a>&nbsp;\ub0b4\uc6a9\uc744 \ucc38\uc870\ud55c\ub2e4. \uc774 \uae00\uc740 \uc5d1\uc140 VBA\ub85c \uc124\uba85\ub418\uc5b4 \uc788\uc73c\ub098 Python\uc5d0\uc11c\ub3c4 OLE Automation\uc744 \uc0ac\uc6a9\ud558\uba74 \uac70\uc758 \ub3d9\uc77c\ud558\uac8c \uc801\uc6a9\ud560 \uc218 \uc788\ub2e4.<\/li><li>310~317\ud589: table_comments\ub97c DataFrame df_table\ub85c \ubcc0\ud658\ud558\uace0 &#8216;FileName&#8217;, &#8216;FileType&#8217;, &#8216;Page&#8217;, &#8216;Source&#8217; \uc5f4(column)\uc758 \ub370\uc774\ud130\ub97c \ucd94\uac00\ud55c\ub2e4.<\/li><li>322~334\ud589: \uceec\ub7fc&nbsp;comment\uac00 \uc800\uc7a5\ub418\uc5b4 \uc788\ub294 \uc5d1\uc140 \ud30c\uc77c\uc758 \ub450 \ubc88\uc9f8 \uc2dc\ud2b8\uc5d0\uc11c Text\ub97c \ucd94\ucd9c\ud55c\ub2e4.<\/li><li>323\ud589: \uceec\ub7fc comment \uc2dc\ud2b8\uc758 \uac00\uc7a5 \ub9c8\uc9c0\ub9c9 \ud589 \ubc88\ud638\ub97c \uad6c\ud55c\ub2e4. 306\ud589\uacfc \ub3d9\uc77c\ud55c \ubc29\ubc95\uc744 \uc0ac\uc6a9\ud55c\ub2e4.<\/li><li>326\ud589: \uceec\ub7fc comment \uc2dc\ud2b8\uc758 \ub0b4\uc6a9\uc744 column_comments \ubcc0\uc218\ub85c \uc77d\ub294\ub2e4. 
309\ud589\uacfc \ub3d9\uc77c\ud55c \ubc29\ubc95\uc744 \uc0ac\uc6a9\ud55c\ub2e4.<\/li><li>327~334\ud589: column_comments\ub97c DataFrame df_column\uc73c\ub85c \ubcc0\ud658\ud558\uace0 &#8216;FileName&#8217;, &#8216;FileType&#8217;, &#8216;Page&#8217;, &#8216;Source&#8217; \uc5f4(column)\uc758 \ub370\uc774\ud130\ub97c \ucd94\uac00\ud55c\ub2e4.<\/li><li>339\ud589: df_column\uacfc df_table\uc744 \ud569\uccd0\uc11c df_text\ub97c \ub9cc\ub4e0\ub2e4.<\/li><li>347\ud589: DB \ud14c\uc774\ube14, \uceec\ub7fc comment\uac00 \uc800\uc7a5\ub41c \uc5d1\uc140 \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c \ud14d\uc2a4\ud2b8\uac00&nbsp;\ub2f4\uaca8\uc788\ub294&nbsp;df_text\ub97c&nbsp;\ubc18\ud658\ud55c\ub2e4.<\/li><\/ul>\n\n\n\n<h4 class=\"wp-block-heading\"><span class=\"ez-toc-section\" id=\"435_get_hwp_text_%ED%95%A8%EC%88%98\"><\/span><strong>4.3.5. get_hwp_text&nbsp;\ud568\uc218<\/strong><span class=\"ez-toc-section-end\"><\/span><\/h4>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"281\" data-enlighter-title=\"\" data-enlighter-group=\"\">def get_hwp_text(file_name) -> DataFrame:\n    pass<\/pre>\n\n\n\n<p>\ud604\uc7ac \uad6c\ud604\ub418\uc5b4 \uc788\uc9c0 \uc54a\ub2e4. \ud5a5\ud6c4 \ud544\uc694\uc2dc \uad6c\ud604 \uc608\uc815\uc774\ub2e4.<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"4.3.6._get_pdf_text_\ud568\uc218\"><span class=\"ez-toc-section\" id=\"436_get_pdf_text_%ED%95%A8%EC%88%98\"><\/span>4.3.6. 
get_pdf_text&nbsp;\ud568\uc218<span class=\"ez-toc-section-end\"><\/span><\/h4>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"286\" data-enlighter-title=\"\" data-enlighter-group=\"\">def get_pdf_text(file_name) -> DataFrame:\n    pass<\/pre>\n\n\n\n<p>\ud604\uc7ac&nbsp;\uad6c\ud604\ub418\uc5b4&nbsp;\uc788\uc9c0&nbsp;\uc54a\ub2e4.&nbsp;\ud5a5\ud6c4&nbsp;\ud544\uc694\uc2dc&nbsp;\uad6c\ud604 \uc608\uc815\uc774\ub2e4.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"4.4._get_word_list_\ud568\uc218\"><span class=\"ez-toc-section\" id=\"44_get_word_list_%ED%95%A8%EC%88%98\"><\/span>4.4. get_word_list \ud568\uc218<span class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<p>\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c\uc5d0\uc11c \uac00\uc7a5 \ud575\uc2ec\uc778 \ud568\uc218\uc774\ub2e4.<\/p>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"35,38,64-65,70,84,88\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"26\" data-enlighter-title=\"\" data-enlighter-group=\"\">def get_word_list(df_text) -> DataFrame:\n    \"\"\"\n    text \ucd94\ucd9c\uacb0\uacfc DataFrame\uc5d0\uc11c \uba85\uc0ac\ub97c \ucd94\ucd9c\ud558\uc5ec \ucd5c\uc885 output\uc744 DataFrame type\uc73c\ub85c return\n    :param df_text: \ud30c\uc77c\uc5d0\uc11c \ucd94\ucd9c\ud55c text(DataFrame type)\n    :return: \uba85\uc0ac, \ubcf5\ud569\uc5b4(1\uac1c \uc774\uc0c1\uc758 \uba85\uc0ac, \uc811\ub450\uc0ac+\uba85\uc0ac+\uc811\ubbf8\uc0ac) \ucd94\ucd9c\uacb0\uacfc(Dataframe type)\n    \"\"\"\n    start_time = time.time()\n    df_result = DataFrame()\n\n    tagger = Mecab()\n    # tagger = Komoran()\n    row_idx = 0\n    for index, row in df_text.iterrows():\n        row_idx += 1\n        if row_idx % 100 == 0:  # 100\uac74\ub9c8\ub2e4 \ud604\uc7ac \uc9c4\ud589\uc0c1\ud0dc \ucd9c\ub825\n           
 print('[pid:%d] current: %d, total: %d, progress: %3.2f%%' %\n                  (os.getpid(), row_idx, df_text.shape[0], round(row_idx \/ df_text.shape[0] * 100, 2)))\n        file_name = row['FileName']\n        file_type = row['FileType']\n        page = row['Page']\n        text = str(row['Text'])\n        source = (row['Source'])\n        is_db = True if row['FileType'] in ('table', 'column') else False\n        is_db_table = True if row['FileType'] == 'table' else False\n        is_db_column = True if row['FileType'] == 'column' else False\n        if is_db:\n            db = row['DB']\n            schema = row['Schema']\n            table = row['Table']\n            if is_db_column:\n                column = row['Column']\n\n        if text is None or text.strip() == '':\n            continue\n        try:\n            # nouns = mecab.nouns(text)\n            # [O]ToDo: \uc5f0\uc18d\ub41c \uccb4\uc5b8\uc811\ub450\uc0ac(XPN), \uba85\uc0ac\ud30c\uc0dd\uc811\ubbf8\uc0ac(XSN) \uae4c\uc9c0 \ud3ec\ud568\ud558\uc5ec \ucd94\ucd9c\n            # [O]ToDo: \uba85\uc0ac(NNG, NNP)\uac00 \uc5f0\uc18d\ub420 \ub54c \uac01\uac01 \uba85\uc0ac\uc640 \uc5f0\uacb0\ub41c \ubcf5\ud569\uba85\uc0ac \ud568\uaed8 \ucd94\ucd9c\n            text_pos = tagger.pos(text)\n            words = [pos for pos, tag in text_pos if tag in ['NNG', 'NNP', 'SL']]  # NNG: \uc77c\ubc18\uba85\uc0ac, NNP: \uace0\uc720\uba85\uc0ac\n            pos_list = [x for (x, y) in text_pos]\n            tag_list = [y for (x, y) in text_pos]\n            pos_str = '\/'.join(pos_list) + '\/'\n            tag_str = '\/'.join(tag_list) + '\/'\n            iterator = re.finditer('(NNP\/|NNG\/)+(XSN\/)*|(XPN\/)+(NNP\/|NNG\/)+(XSN\/)*|(SL\/)+', tag_str)\n            for mo in iterator:\n                x, y = mo.span()\n                if x == 0:\n                    start_idx = 0\n                else:\n                    start_idx = tag_str[:x].count('\/')\n                end_idx = tag_str[:y].count('\/')\n            
    sub_pos = ''\n                # if end_idx - start_idx > 1 and not (start_idx == 0 and end_idx == len(tag_list)):\n                if end_idx - start_idx > 1:\n                    for i in range(start_idx, end_idx):\n                        sub_pos += pos_list[i]\n                    # print('%s[sub_pos]' % sub_pos)\n                    words.append('%s[\ubcf5\ud569\uc5b4]' % sub_pos)  # \ucd94\uac00 \ud615\ud0dc\uc18c \ub4f1\ub85d\n\n            if len(words) >= 1:\n                # print(nouns, text)\n                for word in words:\n                    # print(noun, '\\t', text)\n                    if not is_db:\n                        # sr_text = Series([file_name, file_type, page, text, word],\n                        #                  index=['FileName', 'FileType', 'Page', 'Text', 'Word'])\n                        df_word = DataFrame(\n                            {'FileName': [file_name], 'FileType': [file_type], 'Page': [page], 'Text': [text],\n                             'Word': [word], 'Source': [source]})\n                    elif is_db_table:\n                        # sr_text = Series([file_name, file_type, page, text, word, db, schema, table],\n                        #                  index=['FileName', 'FileType', 'Page', 'Text', 'Word', 'DB', 'Schema', 'Table'])\n                        df_word = DataFrame(\n                            {'FileName': [file_name], 'FileType': [file_type], 'Page': [page], 'Text': [text],\n                             'Word': [word], 'DB': [db], 'Schema': [schema], 'Table': [table], 'Source': [source]})\n                    elif is_db_column:\n                        # sr_text = Series([file_name, file_type, page, text, word, db, schema, table, column],\n                        #                  index=['FileName', 'FileType', 'Page', 'Text', 'Word', 'DB', 'Schema', 'Table', 'Column'])\n                        df_word = DataFrame(\n                            {'FileName': 
[file_name], 'FileType': [file_type], 'Page': [page], 'Text': [text],\n                             'Word': [word], 'DB': [db], 'Schema': [schema], 'Table': [table], 'Column': [column],\n                             'Source': [source]})\n                    # df_result = df_result.append(sr_text, ignore_index=True)  # Todo: append\ub97c concat\uc73c\ub85c \ubc14\uafb8\uae30\n                    df_result = pd.concat([df_result, df_word], ignore_index=True)\n        except Exception as ex:\n            print('[pid:%d] Exception has raised for text: %s' % (os.getpid(), text))\n            print(ex)\n\n    print(\n        '[pid:%d] input text count:%d, extracted word count: %d' % (os.getpid(), df_text.shape[0], df_result.shape[0]))\n    end_time = time.time()\n    # elapsed_time = end_time - start_time\n    elapsed_time = str(datetime.timedelta(seconds=end_time - start_time))\n    print('[pid:%d] get_word_list finished. total: %d, elapsed time: %s' %\n          (os.getpid(), df_text.shape[0], elapsed_time))\n    return df_result<\/pre>\n\n\n\n<ul class=\"wp-block-list\"><li>35\ud589: \uc790\uc5f0\uc5b4 \ud615\ud0dc\uc18c \ubd84\uc11d\uae30 Mecab \uac1d\uccb4\ub97c \uc0dd\uc131\ud55c\ub2e4. Mecab\uc774 \uc544\ub2cc \ub2e4\ub978 tagger\ub97c \uc0ac\uc6a9\ud558\ub824\uba74 \uc5ec\uae30\uc5d0\uc11c package\uba85\uc744 \ubcc0\uacbd\ud55c\ub2e4.<\/li><li>38\ud589: DataFrame df_text\uc758 \ud589\uc744 \uc21c\ud68c\ud55c\ub2e4.<\/li><li>64\ud589: pos \ud568\uc218\ub85c \ud615\ud0dc\uc18c \ubd84\uc11d\uae30\uc758 \ud488\uc0ac tagging\uc744 \uc2e4\ud589\ud55c\ub2e4. 
\ud488\uc0ac tagging\uacfc \uad00\ub828\ud55c \ub0b4\uc6a9\uc740 \ubcc4\ub3c4\ub85c \uc815\ub9ac\ud558\uaca0\ub2e4.<ul><li>\ud488\uc0ac tagging \ud568\uc218 pos\ub294 \uc785\ub825 \ubb38\uc790\uc5f4\uc744 \ud488\uc0ac \ub2e8\uc704\ub85c \ubd84\ud574\ud558\uace0 \uac01 \ub2e8\uc704\uac00 \uc5b4\ub5a4 \ud488\uc0ac\uc778\uc9c0 \ud45c\uc2dc(tagging)\ud55c \ubb38\uc790\uc5f4\uc744 \ubc18\ud658\ud55c\ub2e4.<\/li><li>\uc608\ub97c \ub4e4\uc5b4, text\uac00 &#8216;\uc0ac\uc6a9\uc790\ub294 \uae30\ub2a5\uc801 \uc694\uad6c\uc0ac\ud56d\uacfc \ube44\uae30\ub2a5\uc801 \uc694\uad6c\uc0ac\ud56d\uc744 \uc815\uc758\ud55c\ub2e4.&#8217;\uc778 \uacbd\uc6b0, pos \ud568\uc218\uc758 \uc2e4\ud589\uacb0\uacfc\ub294 &#8216;[(&#8216;\uc0ac\uc6a9&#8217;, &#8216;NNG&#8217;), (&#8216;\uc790&#8217;, &#8216;XSN&#8217;), (&#8216;\ub294&#8217;, &#8216;JX&#8217;), (&#8216;\uae30\ub2a5&#8217;, &#8216;NNG&#8217;), (&#8216;\uc801&#8217;, &#8216;XSN&#8217;), (&#8216;\uc694\uad6c&#8217;, &#8216;NNG&#8217;), (&#8216;\uc0ac\ud56d&#8217;, &#8216;NNG&#8217;), (&#8216;\uacfc&#8217;, &#8216;JC&#8217;), (&#8216;\ube44&#8217;, &#8216;XPN&#8217;), (&#8216;\uae30\ub2a5&#8217;, &#8216;NNG&#8217;), (&#8216;\uc801&#8217;, &#8216;XSN&#8217;), (&#8216;\uc694\uad6c&#8217;, &#8216;NNG&#8217;), (&#8216;\uc0ac\ud56d&#8217;, &#8216;NNG&#8217;), (&#8216;\uc744&#8217;, &#8216;JKO&#8217;), (&#8216;\uc815\uc758&#8217;, &#8216;NNG&#8217;), (&#8216;\ud55c\ub2e4&#8217;, &#8216;XSV+EF&#8217;), (&#8216;.&#8217;, &#8216;SF&#8217;)]&#8217;\uc774\ub2e4.<\/li><li>\uc704 \uc608\uc2dc\uc5d0 tagging \ub41c \ud488\uc0ac \uc911 &#8216;NNG&#8217;\ub294 \uc77c\ubc18 \uba85\uc0ac, &#8216;XSN&#8217;\ub294 \uba85\uc0ac \ud30c\uc0dd \uc811\ubbf8\uc0ac, &#8216;JX&#8217;\ub294 \ubcf4\uc870\uc0ac, &#8216;JC&#8217;\ub294 \uc811\uc18d \uc870\uc0ac, &#8216;XPN&#8217;\uc740 \uccb4\uc5b8 \uc811\ub450\uc0ac, &#8216;JKO&#8217;\ub294 \ubaa9\uc801\uaca9 \uc870\uc0ac, &#8216;XSV+EF&#8217;\ub294 \ub3d9\uc0ac \ud30c\uc0dd \uc811\ubbf8\uc0ac+\uc885\uacb0 
\uc5b4\ubbf8, &#8216;SF&#8217;\ub294 \ub9c8\uce68\ud45c\/\ubb3c\uc74c\ud45c\/\ub290\ub08c\ud45c\ub97c \uc758\ubbf8\ud55c\ub2e4.<\/li><\/ul><\/li><li>65\ud589: \ud488\uc0ac tagging \uacb0\uacfc\uc5d0\uc11c \ud45c\uc900 \ub2e8\uc5b4 \ud6c4\ubcf4\ub85c \uac00\uc7a5 \uc801\ud569\ud55c \ud488\uc0ac\uc778 \uc77c\ubc18 \uba85\uc0ac(NNG), \uace0\uc720 \uba85\uc0ac(NNP), \uc678\uad6d\uc5b4(SL)\ub97c \uace8\ub77c\ub0b8\ub2e4. \uc678\uad6d\uc5b4(SL)\ub294 \uc54c\ud30c\ubcb3\uc73c\ub85c \uad6c\uc131\ub41c \uc57d\uc5b4\ub97c \ud45c\uc900 \ub2e8\uc5b4 \ud6c4\ubcf4\ub85c \ucd94\ucd9c\ud558\uae30 \uc704\ud574 \uc9c0\uc815\ud558\uc600\ub2e4.<\/li><li>70\ud589: \uc815\uaddc\ud45c\ud604\uc2dd(regular expression)\uc744 \uc774\uc6a9\ud558\uc5ec &#8216;(NNP\/|NNG\/)+(XSN\/)*|(XPN\/)+(NNP\/|NNG\/)+(XSN\/)*|(SL\/)+&#8217; \ud328\ud134\uc744 \ucc3e\ub294\ub2e4.<ul><li>\uc774 \ud328\ud134\uc740 \ub2e4\uc74c \uc138 \uac00\uc9c0 \uc911 \ud558\ub098\ub97c \ucc3e\uc544\ub0b8\ub2e4.<ul><li>(NNP\/|NNG\/)+(XSN\/)*: (\uace0\uc720 \uba85\uc0ac \ub610\ub294 \uc77c\ubc18\uba85\uc0ac) 1\uac1c \uc774\uc0c1 + \uba85\uc0ac \ud30c\uc0dd \uc811\ubbf8\uc0ac 0\uac1c \uc774\uc0c1<\/li><li>(XPN\/)+(NNP\/|NNG\/)+(XSN\/)*: \uccb4\uc5b8 \uc811\ub450\uc0ac 1\uac1c \uc774\uc0c1 + (\uace0\uc720&nbsp;\uba85\uc0ac&nbsp;\ub610\ub294&nbsp;\uc77c\ubc18\uba85\uc0ac)&nbsp;1\uac1c&nbsp;\uc774\uc0c1&nbsp;+&nbsp;\uba85\uc0ac&nbsp;\ud30c\uc0dd&nbsp;\uc811\ubbf8\uc0ac&nbsp;0\uac1c&nbsp;\uc774\uc0c1<\/li><li>(SL\/)+: \uc678\uad6d\uc5b4 1\uac1c \uc774\uc0c1<\/li><\/ul><\/li><\/ul><\/li><li>71~84\ud589: \uc704 \uc815\uaddc\ud45c\ud604\uc2dd \ud328\ud134\uc73c\ub85c \ucc3e\uc544\uc9c0\ub294 \ub2e8\uc5b4\ub4e4\uc744 \uc5f0\uacb0\ud558\uace0 suffix &#8216;[\ubcf5\ud569\uc5b4]&#8217;\ub97c \ubd99\uc5ec\uc11c \ucd94\ucd9c \ub2e8\uc5b4 \ubaa9\ub85d\uc5d0 \ucd94\uac00\ud55c\ub2e4. 
\ub098\uc911\uc5d0 \ud45c\uc900 \ub2e8\uc5b4 \uc0ac\uc804 \uc815\uc81c \uc791\uc5c5\uc744 \ud560 \ub54c, \ubcf5\ud569\uc5b4\ub85c \ucd94\ucd9c\ub41c \ub2e8\uc5b4\ub97c \uc2dd\ubcc4\ud558\uae30 \uc704\ud574 \uc77c\ubd80\ub7ec suffix\ub97c \ubd99\uc5ec\ub454\ub2e4.<\/li><li>86~110\ud589: \ucd94\ucd9c\ub41c \ub2e8\uc5b4\ub97c \ucd9c\ucc98\uc640 \ud30c\uc77c \ud615\uc2dd \ub4f1\uc758 \ubd80\uac00 \uc18d\uc131\uc744 \ub354\ud558\uc5ec DataFrame\uc5d0 \ub2f4\ub294\ub2e4.<\/li><li>122\ud589: \ucd94\ucd9c\ub41c \ub2e8\uc5b4 \ubaa9\ub85d\uc774 \ub2f4\uaca8\uc788\ub294 df_result\ub97c \ubc18\ud658\ud55c\ub2e4.&nbsp;<\/li><\/ul>\n\n\n\n<p>\ucc38\uace0\ub85c, \ub2e4\ub978 \ud488\uc0ac \ud328\ud134\uc744 \ucd94\uac00\ub85c \ucd94\ucd9c\ud558\ub824\uba74 65\ud589\uacfc 70\ud589\uc744 \uc218\uc815\ud558\uba74 \ub41c\ub2e4.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"4.5._make_word_cloud_\ud568\uc218\"><span class=\"ez-toc-section\" id=\"45_make_word_cloud_%ED%95%A8%EC%88%98\"><\/span>4.5. make_word_cloud \ud568\uc218<span class=\"ez-toc-section-end\"><\/span><\/h3>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"python\" data-enlighter-theme=\"\" data-enlighter-highlight=\"258-263,269-270\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"245\" data-enlighter-title=\"\" data-enlighter-group=\"\">def make_word_cloud(df_group, now_dt, out_path):\n    \"\"\"\n    \uba85\uc0ac\uc758 \ube48\ub3c4\ub97c \uad6c\ud55c DataFrame\uc73c\ub85c word cloud \uadf8\ub9ac\uae30\n    :param df_group: \uba85\uc0ac \ube48\ub3c4 DataFrame\n    :param now_dt: \ud604\uc7ac \ub0a0\uc9dc \uc2dc\uac01\n    :param out_path: \ucd9c\ub825\uacbd\ub85c\n    :return: None\n    \"\"\"\n    start_time = time.time()\n    print('\\r\\nstart make_word_cloud...')\n    from wordcloud import WordCloud\n    import matplotlib.pyplot as plt\n    # malgun.ttf # NanumSquare.ttf # NanumSquareR.ttf NanumMyeongjo.ttf # NanumBarunpenR.ttf # NanumBarunGothic.ttf\n    wc = 
WordCloud(font_path='.\\\\font\\\\NanumBarunGothic.ttf',\n                   background_color='white',\n                   max_words=500,\n                   width=1800,\n                   height=1000\n                   )\n\n    # print(df_group.head(10))\n    words = df_group.to_dict()['Freq']\n    # print(words)\n    # words = df_group.T.to_dict('list')\n    wc.generate_from_frequencies(words)\n    wc.to_file('%s\\\\wordcloud_%s.png' % (out_path, now_dt))\n    # plt.axis('off')\n    end_time = time.time()\n    # elapsed_time = end_time - start_time\n    elapsed_time = str(datetime.timedelta(seconds=end_time - start_time))\n    print('make_word_cloud elapsed time: %s' % elapsed_time)<\/pre>\n\n\n\n<p>\uc774 \ud568\uc218\uc5d0\uc11c\ub294 WordCloud package\ub97c \uc0ac\uc6a9\ud55c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-full\"><a href=\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png\"><img loading=\"lazy\" decoding=\"async\" width=\"372\" height=\"373\" src=\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png\" alt=\"WordCloud \uc608\uc2dc\" class=\"wp-image-11555\" srcset=\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png 372w, https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151-300x300.png 300w, https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151-150x150.png 150w, https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151-12x12.png 12w, https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151-80x80.png 80w, https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151-320x320.png 320w, https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151-24x24.png 24w, https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151-36x36.png 36w, https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151-48x48.png 48w\" sizes=\"auto, (max-width: 372px) 100vw, 372px\" \/><\/a><figcaption>WordCloud 
\uc608\uc2dc<\/figcaption><\/figure>\n<\/div>\n\n\n<ul class=\"wp-block-list\"><li>258~263\ud589: WordCloud \uac1c\uccb4\ub97c \uc0dd\uc131\ud55c\ub2e4.<ul><li>font \ud3f4\ub354 \ud558\uc704\uc5d0 \uc788\ub294 NanumBarunGothic.ttf(\ub098\ub214\ubc14\ub978\uace0\ub515) \uae00\uaf34 \ud30c\uc77c\uc744 \uc0ac\uc6a9\ud558\uc600\ub2e4. \ub2e4\ub978 \uae00\uaf34\ub85c \ubcc0\uacbd\ud558\ub824\uba74 font \ud3f4\ub354\uc5d0 \uae00\uaf34 \ud30c\uc77c\uc744 \ubcf5\uc0ac\ud558\uace0 \uadf8 \ud30c\uc77c\uba85\uc744 \uc9c0\uc815\ud558\uba74 \ub41c\ub2e4.<\/li><li>background_color, max_words, width, height\ub294 \uc6d0\ud558\ub294 \uac12\uc73c\ub85c \ubcc0\uacbd\ud574\uc11c \uc0ac\uc6a9\ud558\uba74 \ub41c\ub2e4.<\/li><\/ul><\/li><li>266\ud589: DataFrame df_group\uc5d0\uc11c Key\ub294 Index(\ub2e8\uc5b4), Value\ub294 &#8216;Freq&#8217;(\ube48\ub3c4)\ub85c \uad6c\uc131\ub41c dictionary words\ub97c \uc0dd\uc131\ud55c\ub2e4.<\/li><li>269\ud589: \ube48\ub3c4\uac00 \ud3ec\ud568\ub41c words\ub85c\ubd80\ud130 WordCloud \uc774\ubbf8\uc9c0\ub97c \uc0dd\uc131\ud55c\ub2e4.<\/li><li>270\ud589: \uc0dd\uc131\ub41c WordCloud \uc774\ubbf8\uc9c0\ub97c \uc800\uc7a5\ud55c\ub2e4.<\/li><\/ul>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity is-style-dots\"\/>\n\n\n\n<p>\uc5ec\uae30\uae4c\uc9c0 \uc18c\uc2a4\ucf54\ub4dc\uc5d0 \ub300\ud55c \uc124\uba85\uc740 \ub9c8\ucce4\ub2e4. 
\ub2e4\uc74c\uc5d0\ub294 \uc18c\uc2a4\ucf54\ub4dc\uc5d0 \ub300\ud55c \ubd80\uac00\uc124\uba85\uacfc \ud488\uc0ac tagging \uad00\ub828 \ub0b4\uc6a9\uc744 \uc0b4\ud3b4\ubcf4\uaca0\ub2e4.<\/p>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<p>&lt;&lt;&nbsp;<strong>\uad00\ub828 \uae00 \ubaa9\ub85d<\/strong>&nbsp;&gt;&gt;<\/p>\n\n\n\n<ul class=\"wp-block-list\"><li><a href=\"https:\/\/prodskill.com\/word-extractor-overview\/\">\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(1): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uac1c\uc694<\/a><\/li><li><a href=\"https:\/\/prodskill.com\/word-extractor-config-runtime-environment\/\">\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(2): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc2e4\ud589\ud658\uacbd \uad6c\uc131<\/a><\/li><li><a href=\"https:\/\/prodskill.com\/word-extractor-run-and-check-result\/\">\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(3): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc2e4\ud589 \ubc29\ubc95\uacfc \uacb0\uacfc \ud655\uc778 \ubc29\ubc95<\/a><\/li><li><a href=\"https:\/\/prodskill.com\/word-extractor-source-code-1\/\">\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(4): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc124\uba85(1)<\/a><\/li><li><a href=\"https:\/\/prodskill.com\/word-extractor-source-code-2\/\">\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(5): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc124\uba85(2)<\/a><\/li><li><a href=\"https:\/\/prodskill.com\/word-extractor-additional-information\/\">\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(6): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \ubd80\uac00 \uc124\uba85<\/a><\/li><li><a href=\"https:\/\/prodskill.com\/word-extractor-toc\/\">\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc124\uba85\uae00 \uc804\uccb4 \ubaa9\ucc28 , 
\ub2e4\uc6b4\ub85c\ub4dc<\/a><\/li><\/ul>\n","protected":false},"excerpt":{"rendered":"<p>\u63a5\u4e0a\u4e00\u7bc7\uff0c\u6211\u4eec\u6765\u770b\u4e00\u4e0b\u7528Python\u5b9e\u73b0\u7684\u5206\u8bcd\u5de5\u5177\u7684\u6e90\u7801\u3002\u8fd9\u662f\u4e0a\u4e00\u7bc7\u6587\u7ae0\u7684\u5ef6\u7eed\u3002\u5206\u8bcd\u5de5\u5177\uff08\u56db\uff09\uff1a\u5206\u8bcd\u5de5\u5177\u6e90\u7801\u8bf4\u660e\uff08\u4e00\uff09 4.\u5206\u8bcd\u5de5\u5177\u6e90\u7801 4.3\uff0e get_file_text function Lines 357-365: \u6839\u636e\u6587\u4ef6\u6269\u5c55\u540d\u6267\u884c\u76f8\u5e94\u7684\u51fd\u6570\u5e76\u5c06\u7ed3\u679c\u653e\u5165df_text ...<\/p>","protected":false},"author":1,"featured_media":11555,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[21],"tags":[15,81,83,84,85,86,87],"class_list":["post-11553","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-word-extractor","tag-python","tag-mecab","tag-83","tag-word-extractor","tag-nlp","tag-86","tag-87"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v27.2 - https:\/\/yoast.com\/product\/yoast-seo-wordpress\/ -->\n<title>\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(5): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc124\uba85(2) - \uc0dd\uc0b0\uc131 Skill<\/title>\n<meta name=\"description\" content=\"Python\uc73c\ub85c \uad6c\ud604\ud55c \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc5d0 \ub300\ud574 \uc0b4\ud3b4\ubcf8\ub2e4. 
\uc8fc\uc694 \ud568\uc218\uc778 get_file_text \ud568\uc218\uc758 \ud30c\uc77c\uc720\ud615(doc, ppt, txt)\ubcc4\ub85c \uad6c\ud604\ud55c \uc18c\uc2a4\ucf54\ub4dc\uc640 \ud575\uc2ec \ud568\uc218\uc778 get_word_list\ub97c \ud655\uc778\ud560 \uc218 \uc788\ub2e4.\" \/>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(5): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc124\uba85(2) - \uc0dd\uc0b0\uc131 Skill\" \/>\n<meta property=\"og:description\" content=\"Python\uc73c\ub85c \uad6c\ud604\ud55c \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc5d0 \ub300\ud574 \uc0b4\ud3b4\ubcf8\ub2e4. \uc8fc\uc694 \ud568\uc218\uc778 get_file_text \ud568\uc218\uc758 \ud30c\uc77c\uc720\ud615(doc, ppt, txt)\ubcc4\ub85c \uad6c\ud604\ud55c \uc18c\uc2a4\ucf54\ub4dc\uc640 \ud575\uc2ec \ud568\uc218\uc778 get_word_list\ub97c \ud655\uc778\ud560 \uc218 \uc788\ub2e4.\" \/>\n<meta property=\"og:url\" content=\"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/\" \/>\n<meta property=\"og:site_name\" content=\"\uc0dd\uc0b0\uc131 Skill\" \/>\n<meta property=\"article:published_time\" content=\"2022-09-25T09:32:49+00:00\" \/>\n<meta property=\"article:modified_time\" content=\"2022-10-10T11:58:28+00:00\" \/>\n<meta property=\"og:image\" content=\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png\" \/>\n\t<meta property=\"og:image:width\" content=\"372\" \/>\n\t<meta property=\"og:image:height\" content=\"373\" \/>\n\t<meta property=\"og:image:type\" content=\"image\/png\" \/>\n<meta name=\"author\" content=\"Zerom\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta 
name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"Zerom\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"12 \u5206\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"Article\",\"@id\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/#article\",\"isPartOf\":{\"@id\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/\"},\"author\":{\"name\":\"Zerom\",\"@id\":\"https:\/\/prodskill.com\/ko\/#\/schema\/person\/bbad0870c78008c82edbe0960fe768bd\"},\"headline\":\"\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(5): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc124\uba85(2)\",\"datePublished\":\"2022-09-25T09:32:49+00:00\",\"dateModified\":\"2022-10-10T11:58:28+00:00\",\"mainEntityOfPage\":{\"@id\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/\"},\"wordCount\":323,\"commentCount\":0,\"publisher\":{\"@id\":\"https:\/\/prodskill.com\/ko\/#\/schema\/person\/bbad0870c78008c82edbe0960fe768bd\"},\"image\":{\"@id\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/#primaryimage\"},\"thumbnailUrl\":\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png\",\"keywords\":[\"python\",\"MeCab\",\"\ub2e8\uc5b4 \ucd94\ucd9c\",\"word-extractor\",\"nlp\",\"\ud615\ud0dc\uc18c \ubd84\uc11d\uae30\",\"\uc790\uc5f0\uc5b4 \ucc98\ub9ac\"],\"articleSection\":[\"\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c\"],\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"CommentAction\",\"name\":\"Comment\",\"target\":[\"https:\/\/prodskill.com\/word-extractor-source-code-2\/#respond\"]}]},{\"@type\":\"WebPage\",\"@id\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/\",\"url\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/\",\"name\":\"\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(5): \ub2e8\uc5b4 \ucd94\ucd9c 
\ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc124\uba85(2) - \uc0dd\uc0b0\uc131 Skill\",\"isPartOf\":{\"@id\":\"https:\/\/prodskill.com\/ko\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/#primaryimage\"},\"image\":{\"@id\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/#primaryimage\"},\"thumbnailUrl\":\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png\",\"datePublished\":\"2022-09-25T09:32:49+00:00\",\"dateModified\":\"2022-10-10T11:58:28+00:00\",\"description\":\"Python\uc73c\ub85c \uad6c\ud604\ud55c \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc5d0 \ub300\ud574 \uc0b4\ud3b4\ubcf8\ub2e4. \uc8fc\uc694 \ud568\uc218\uc778 get_file_text \ud568\uc218\uc758 \ud30c\uc77c\uc720\ud615(doc, ppt, txt)\ubcc4\ub85c \uad6c\ud604\ud55c \uc18c\uc2a4\ucf54\ub4dc\uc640 \ud575\uc2ec \ud568\uc218\uc778 get_word_list\ub97c \ud655\uc778\ud560 \uc218 \uc788\ub2e4.\",\"breadcrumb\":{\"@id\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/prodskill.com\/word-extractor-source-code-2\/\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/#primaryimage\",\"url\":\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png\",\"contentUrl\":\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png\",\"width\":372,\"height\":373},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/prodskill.com\/word-extractor-source-code-2\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\/\/prodskill.com\/ko\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(5): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc 
\uc124\uba85(2)\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/prodskill.com\/ko\/#website\",\"url\":\"https:\/\/prodskill.com\/ko\/\",\"name\":\"\uc0dd\uc0b0\uc131 Skill\",\"description\":\"Meta Thinking, Meta Working\",\"publisher\":{\"@id\":\"https:\/\/prodskill.com\/ko\/#\/schema\/person\/bbad0870c78008c82edbe0960fe768bd\"},\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/prodskill.com\/ko\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"zh-Hans\"},{\"@type\":[\"Person\",\"Organization\"],\"@id\":\"https:\/\/prodskill.com\/ko\/#\/schema\/person\/bbad0870c78008c82edbe0960fe768bd\",\"name\":\"Zerom\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/productivity_clockgear.png\",\"url\":\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/productivity_clockgear.png\",\"contentUrl\":\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/productivity_clockgear.png\",\"width\":512,\"height\":512,\"caption\":\"Zerom\"},\"logo\":{\"@id\":\"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/productivity_clockgear.png\"},\"url\":\"https:\/\/prodskill.com\/zh\/author\/proda\/\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"\u5355\u8bcd\u63d0\u53d6\u5de5\u5177\uff085\uff09\uff1a\u5355\u8bcd\u63d0\u53d6\u5de5\u5177\u6e90\u4ee3\u7801\u8bf4\u660e\uff082\uff09 - \u751f\u4ea7\u529b\u6280\u80fd","description":"\u6211\u4eec\u6765\u770b\u4e00\u4e0b\u7528 Python \u5b9e\u73b0\u7684\u5355\u8bcd\u63d0\u53d6\u5de5\u5177\u7684\u6e90\u4ee3\u7801\u3002\u60a8\u53ef\u4ee5\u770b\u5230\u6309\u6587\u4ef6\u7c7b\u578b\uff08doc\u3001ppt\u3001txt\uff09\u5b9e\u73b0\u7684\u4e3b\u51fd\u6570 get_file_text \u7684\u6e90\u4ee3\u7801\uff0c\u4ee5\u53ca\u6838\u5fc3\u51fd\u6570 get_word_list \u7684\u6e90\u4ee3\u7801\u3002","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/","og_locale":"zh_CN","og_type":"article","og_title":"\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(5): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc124\uba85(2) - \uc0dd\uc0b0\uc131 Skill","og_description":"Python\uc73c\ub85c \uad6c\ud604\ud55c \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc5d0 \ub300\ud574 \uc0b4\ud3b4\ubcf8\ub2e4. 
\uc8fc\uc694 \ud568\uc218\uc778 get_file_text \ud568\uc218\uc758 \ud30c\uc77c\uc720\ud615(doc, ppt, txt)\ubcc4\ub85c \uad6c\ud604\ud55c \uc18c\uc2a4\ucf54\ub4dc\uc640 \ud575\uc2ec \ud568\uc218\uc778 get_word_list\ub97c \ud655\uc778\ud560 \uc218 \uc788\ub2e4.","og_url":"https:\/\/prodskill.com\/zh\/word-extractor-source-code-2\/","og_site_name":"\uc0dd\uc0b0\uc131 Skill","article_published_time":"2022-09-25T09:32:49+00:00","article_modified_time":"2022-10-10T11:58:28+00:00","og_image":[{"width":372,"height":373,"url":"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png","type":"image\/png"}],"author":"Zerom","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"Zerom","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"12 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"Article","@id":"https:\/\/prodskill.com\/word-extractor-source-code-2\/#article","isPartOf":{"@id":"https:\/\/prodskill.com\/word-extractor-source-code-2\/"},"author":{"name":"Zerom","@id":"https:\/\/prodskill.com\/ko\/#\/schema\/person\/bbad0870c78008c82edbe0960fe768bd"},"headline":"\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(5): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc124\uba85(2)","datePublished":"2022-09-25T09:32:49+00:00","dateModified":"2022-10-10T11:58:28+00:00","mainEntityOfPage":{"@id":"https:\/\/prodskill.com\/word-extractor-source-code-2\/"},"wordCount":323,"commentCount":0,"publisher":{"@id":"https:\/\/prodskill.com\/ko\/#\/schema\/person\/bbad0870c78008c82edbe0960fe768bd"},"image":{"@id":"https:\/\/prodskill.com\/word-extractor-source-code-2\/#primaryimage"},"thumbnailUrl":"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png","keywords":["python","MeCab","\ub2e8\uc5b4 \ucd94\ucd9c","word-extractor","nlp","\ud615\ud0dc\uc18c \ubd84\uc11d\uae30","\uc790\uc5f0\uc5b4 \ucc98\ub9ac"],"articleSection":["\ub2e8\uc5b4 \ucd94\ucd9c 
\ub3c4\uad6c"],"inLanguage":"zh-Hans","potentialAction":[{"@type":"CommentAction","name":"Comment","target":["https:\/\/prodskill.com\/word-extractor-source-code-2\/#respond"]}]},{"@type":"WebPage","@id":"https:\/\/prodskill.com\/word-extractor-source-code-2\/","url":"https:\/\/prodskill.com\/word-extractor-source-code-2\/","name":"\u5355\u8bcd\u63d0\u53d6\u5de5\u5177\uff085\uff09\uff1a\u5355\u8bcd\u63d0\u53d6\u5de5\u5177\u6e90\u4ee3\u7801\u8bf4\u660e\uff082\uff09 - \u751f\u4ea7\u529b\u6280\u80fd","isPartOf":{"@id":"https:\/\/prodskill.com\/ko\/#website"},"primaryImageOfPage":{"@id":"https:\/\/prodskill.com\/word-extractor-source-code-2\/#primaryimage"},"image":{"@id":"https:\/\/prodskill.com\/word-extractor-source-code-2\/#primaryimage"},"thumbnailUrl":"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png","datePublished":"2022-09-25T09:32:49+00:00","dateModified":"2022-10-10T11:58:28+00:00","description":"\u6211\u4eec\u6765\u770b\u4e00\u4e0b\u7528 Python \u5b9e\u73b0\u7684\u5355\u8bcd\u63d0\u53d6\u5de5\u5177\u7684\u6e90\u4ee3\u7801\u3002\u60a8\u53ef\u4ee5\u770b\u5230\u6309\u6587\u4ef6\u7c7b\u578b\uff08doc\u3001ppt\u3001txt\uff09\u5b9e\u73b0\u7684\u4e3b\u51fd\u6570 get_file_text \u7684\u6e90\u4ee3\u7801\uff0c\u4ee5\u53ca\u6838\u5fc3\u51fd\u6570 get_word_list 
\u7684\u6e90\u4ee3\u7801\u3002","breadcrumb":{"@id":"https:\/\/prodskill.com\/word-extractor-source-code-2\/#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/prodskill.com\/word-extractor-source-code-2\/"]}]},{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/prodskill.com\/word-extractor-source-code-2\/#primaryimage","url":"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png","contentUrl":"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/image-151.png","width":372,"height":373},{"@type":"BreadcrumbList","@id":"https:\/\/prodskill.com\/word-extractor-source-code-2\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/prodskill.com\/ko\/"},{"@type":"ListItem","position":2,"name":"\ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c(5): \ub2e8\uc5b4 \ucd94\ucd9c \ub3c4\uad6c \uc18c\uc2a4\ucf54\ub4dc \uc124\uba85(2)"}]},{"@type":"WebSite","@id":"https:\/\/prodskill.com\/ko\/#website","url":"https:\/\/prodskill.com\/ko\/","name":"\u751f\u4ea7\u529b\u6280\u80fd","description":"\u5143\u601d\u7ef4\uff0c\u5143\u5de5\u4f5c","publisher":{"@id":"https:\/\/prodskill.com\/ko\/#\/schema\/person\/bbad0870c78008c82edbe0960fe768bd"},"potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/prodskill.com\/ko\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"zh-Hans"},{"@type":["Person","Organization"],"@id":"https:\/\/prodskill.com\/ko\/#\/schema\/person\/bbad0870c78008c82edbe0960fe768bd","name":"\u6cfd\u7f57\u59c6","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/productivity_clockgear.png","url":"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/productivity_clockgear.png","contentUrl":"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/productivity_cl
ockgear.png","width":512,"height":512,"caption":"Zerom"},"logo":{"@id":"https:\/\/prodskill.com\/wp-content\/uploads\/2022\/09\/productivity_clockgear.png"},"url":"https:\/\/prodskill.com\/zh\/author\/proda\/"}]}},"_links":{"self":[{"href":"https:\/\/prodskill.com\/zh\/wp-json\/wp\/v2\/posts\/11553","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/prodskill.com\/zh\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/prodskill.com\/zh\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/prodskill.com\/zh\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/prodskill.com\/zh\/wp-json\/wp\/v2\/comments?post=11553"}],"version-history":[{"count":0,"href":"https:\/\/prodskill.com\/zh\/wp-json\/wp\/v2\/posts\/11553\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/prodskill.com\/zh\/wp-json\/wp\/v2\/media\/11555"}],"wp:attachment":[{"href":"https:\/\/prodskill.com\/zh\/wp-json\/wp\/v2\/media?parent=11553"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/prodskill.com\/zh\/wp-json\/wp\/v2\/categories?post=11553"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/prodskill.com\/zh\/wp-json\/wp\/v2\/tags?post=11553"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}