Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

이슈해결: passportKey 및 tree parsing #54

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 82 additions & 42 deletions hanspell/spell_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
"""
Python용 한글 맞춤법 검사 모듈
"""

import re
import requests
import json
import time
import sys
from collections import OrderedDict
import xml.etree.ElementTree as ET
from lxml import etree

from . import __version__
from .response import Checked
Expand All @@ -19,12 +19,53 @@
PY3 = sys.version_info[0] == 3


def get_passport_key():
"""네이버에서 '네이버 맞춤법 검사기' 페이지에서 passportKey를 획득

- 네이버에서 '네이버 맞춤법 검사기'를 띄운 후
html에서 passportKey를 검색하면 값을 찾을 수 있다.

- 찾은 값을 spell_checker.py 48 line에 적용한다.
"""

url = "https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=네이버+맞춤법+검사기"
res = requests.get(url)

html_text = res.text

match = re.search(r'passportKey=([^&"}]+)', html_text)
if match:
passport_key = match.group(1)
return passport_key
else:
return False


def fix_spell_checker_py_code(file_path, passportKey):
"""획득한 passportkey를 spell_checker.py파일에 적용"""

pattern = r"'passportKey': '.*'"

with open(file_path, "r", encoding="utf-8") as input_file:
content = input_file.read()
modified_content = re.sub(pattern, f"'passportKey': '{passportKey}'", content)

with open(file_path, "w", encoding="utf-8") as output_file:
output_file.write(modified_content)

return


passport_key = get_passport_key()


def _remove_tags(text):
text = u'<content>{}</content>'.format(text).replace('<br>','')
text = "<content>{}</content>".format(text).replace("<br>", "")
if not PY3:
text = text.encode('utf-8')
text = text.encode("utf-8")

result = ''.join(ET.fromstring(text).itertext())
parser = etree.XMLParser(recover=True)
result = "".join(etree.fromstring(text, parser=parser).itertext())

return result

Expand All @@ -44,70 +85,69 @@ def check(text):
if len(text) > 500:
return Checked(result=False)

payload = {
'color_blindness': '0',
'q': text
}
payload = {"passportKey": passport_key, "color_blindness": "0", "q": text}

headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'referer': 'https://search.naver.com/',
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
"referer": "https://search.naver.com/",
}

start_time = time.time()
r = _agent.get(base_url, params=payload, headers=headers)
passed_time = time.time() - start_time

data = json.loads(r.text)
html = data['message']['result']['html']
html = data["message"]["result"]["html"]
result = {
'result': True,
'original': text,
'checked': _remove_tags(html),
'errors': data['message']['result']['errata_count'],
'time': passed_time,
'words': OrderedDict(),
"result": True,
"original": text,
"checked": _remove_tags(html),
"errors": data["message"]["result"]["errata_count"],
"time": passed_time,
"words": OrderedDict(),
}

# 띄어쓰기로 구분하기 위해 태그는 일단 보기 쉽게 바꿔둠.
# ElementTree의 iter()를 써서 더 좋게 할 수 있는 방법이 있지만
# 이 짧은 코드에 굳이 그렇게 할 필요성이 없으므로 일단 문자열을 치환하는 방법으로 작성.
html = html.replace('<em class=\'green_text\'>', '<green>') \
.replace('<em class=\'red_text\'>', '<red>') \
.replace('<em class=\'violet_text\'>', '<violet>') \
.replace('<em class=\'blue_text\'>', '<blue>') \
.replace('</em>', '<end>')
items = html.split(' ')
html = (
html.replace("<em class='green_text'>", "<green>")
.replace("<em class='red_text'>", "<red>")
.replace("<em class='violet_text'>", "<violet>")
.replace("<em class='blue_text'>", "<blue>")
.replace("</em>", "<end>")
)
items = html.split(" ")
words = []
tmp = ''
tmp = ""
for word in items:
if tmp == '' and word[:1] == '<':
pos = word.find('>') + 1
if tmp == "" and word[:1] == "<":
pos = word.find(">") + 1
tmp = word[:pos]
elif tmp != '':
word = u'{}{}'.format(tmp, word)
if word[-5:] == '<end>':
word = word.replace('<end>', '')
tmp = ''
elif tmp != "":
word = "{}{}".format(tmp, word)

if word[-5:] == "<end>":
word = word.replace("<end>", "")
tmp = ""

words.append(word)

for word in words:
check_result = CheckResult.PASSED
if word[:5] == '<red>':
if word[:5] == "<red>":
check_result = CheckResult.WRONG_SPELLING
word = word.replace('<red>', '')
elif word[:7] == '<green>':
word = word.replace("<red>", "")
elif word[:7] == "<green>":
check_result = CheckResult.WRONG_SPACING
word = word.replace('<green>', '')
elif word[:8] == '<violet>':
word = word.replace("<green>", "")
elif word[:8] == "<violet>":
check_result = CheckResult.AMBIGUOUS
word = word.replace('<violet>', '')
elif word[:6] == '<blue>':
word = word.replace("<violet>", "")
elif word[:6] == "<blue>":
check_result = CheckResult.STATISTICAL_CORRECTION
word = word.replace('<blue>', '')
result['words'][word] = check_result
word = word.replace("<blue>", "")
result["words"][word] = check_result

result = Checked(**result)

Expand Down