{"id":632,"date":"2016-11-10T19:02:38","date_gmt":"2016-11-10T10:02:38","guid":{"rendered":"https:\/\/www.dogrow.net\/python\/?p=632"},"modified":"2019-10-19T11:43:06","modified_gmt":"2019-10-19T02:43:06","slug":"blog79","status":"publish","type":"post","link":"https:\/\/www.dogrow.net\/python\/blog79\/","title":{"rendered":"(79) \u306a\u3093\u3061\u3083\u3063\u3066Google\u691c\u7d22"},"content":{"rendered":"<p>Google\u691c\u7d22\uff06\u7d50\u679c\u8868\u793a\u3092Python\u7d4c\u7531\u3067\u884c\u3046\u3002<br \/>\n\u30a6\u30a7\u30d6\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u306e\u57fa\u790e\u306b\u306a\u308b\u306e\u304b\u3082(\uff1f)<\/p>\n<p>\u5b9f\u884c\u30b5\u30f3\u30d7\u30eb\u306f\u3053\u3061\u3089\u3002<br \/>\n<a href=\"https:\/\/www.dogrow.net\/python\/sample\/0078\/\" target=_blank rel=\"noopener noreferrer\">https:\/\/www.dogrow.net\/python\/sample\/0078\/<\/a><\/p>\n<h3 class=\"my_h\">(1\/2) index.cgi<\/h3>\n<p>1\u30da\u30fc\u30b8\u76ee\u3057\u304b\u8868\u793a\u3067\u304d\u3066\u3044\u306a\u3044&#8230;<br \/>\n\u5185\u90e8\u3067 start=10,20,30 \u306a\u3069\u3068\u5207\u308a\u66ff\u3048\u3066\u60c5\u5831\u53ce\u96c6\u3092\u7e70\u308a\u8fd4\u305b\u3070\u8907\u6570\u30da\u30fc\u30b8\u5206\u3092\u8868\u793a\u3067\u304d\u308b\u3051\u3069\u3001\u307e\u305a\u306f\u57fa\u672c\u5f62\u306e\u307f\u3092\u4f5c\u3063\u3066\u307f\u308b\u3002<\/p>\n<pre class=\"brush: python; title: ; notranslate\" title=\"\">\r\n#!\/usr\/local\/bin\/python\r\n# -*- coding: utf-8 -*-\r\n\r\nprint 'Content-type: text\/html'\r\n\r\nimport cgi\r\nimport myparser\r\nimport re\r\n\r\nprint &quot;&quot;&quot;\r\n&lt;!DOCTYPE html&gt;\r\n&lt;html&gt;\r\n&lt;head&gt;\r\n&lt;meta http-equiv=&quot;Content-Type&quot; content=&quot;text\/html; charset=utf-8&quot; \/&gt;\r\n&lt;title&gt;\u30b5\u30f3\u30d7\u30eb&lt;\/title&gt;\r\n&lt;\/head&gt;\r\n&lt;body&gt;\r\n&quot;&quot;&quot;\r\n\r\nform = cgi.FieldStorage()\r\nif form.has_key('word'):\r\n\tword = re.sub(&quot;&#x5B; |\u3000]+&quot;,&quot;+&quot;,form&#x5B;'word'].value)\r\n\turl = 'https:\/\/www.google.co.jp\/search?q=' + word\r\n\tlinkItem = myparser.parse(url)\r\n\tfor item in linkItem:\r\n\t\ttitle = item.get_title()\r\n\t\tsite  = item.get_ref()\r\n\t\tif len(title) &amp;lt;= 0:\r\n\t\t\ttitle = '***'\r\n\t\tif re.match('http', site):\r\n\t\t\thref = site\r\n\t\telse:\r\n\t\t\thref = 'https:\/\/www.google.com' + site\r\n\t\tprint '&amp;lt;a href=\\&quot;%s\\&quot;&amp;gt;%s&amp;lt;\/a&amp;gt;&amp;lt;hr \/&amp;gt;' % (href, title)\r\n\r\nprint &quot;&quot;&quot;\r\n&lt;form enctype=&quot;multipart\/form-data&quot; action=&quot;.\/&quot; method=post&gt;\r\nGoogle\u691c\u7d22\uff1a&lt;input type=text name=word style=&quot;width:400px&quot;&gt;\r\n&lt;input type=submit value=&quot;\u5b9f\u884c&quot;&gt;\r\n&lt;\/form&gt;\r\n&lt;\/body&gt;\r\n&lt;\/html&gt;\r\n&quot;&quot;&quot;\r\n<\/pre>\n<h3 class=\"my_h\">(2\/2) myparser.py<\/h3>\n<p>\u5f37\u5f15\u306a\u3068\u3053\u308d\u3082\u3042\u308b\u306e\u3067\u3001\u5f8c\u3067\u52c9\u5f37\u3057\u76f4\u3057\u3066\u4fee\u6b63\u3059\u308b\u304b\u3082&#8230;<br \/>\n\u6307\u5b9aWEB\u30da\u30fc\u30b8\u306e\u30c7\u30fc\u30bf\u53d6\u5f97\u306b\u306f <span class=mydp>urllib2<\/span> \u30e2\u30b8\u30e5\u30fc\u30eb\u306eHTTP\u30e9\u30c3\u30d1\u30fc\u6a5f\u80fd\u3092\u4f7f\u7528\u3059\u308b\u3002<br \/>\nHTML\u306e\u69cb\u6587\u89e3\u6790\u306b\u306f <span class=mydp>HTMLParser<\/span> \u30e2\u30b8\u30e5\u30fc\u30eb\u3092\u4f7f\u7528\u3059\u308b\u3002\uff08\u4eca\u56de\u306f\u697d\u3067\u304d\u308b <span class=mydp>lxml<\/span> \u3092\u4f7f\u308f\u306a\u3044\uff09<br \/>\nPython\u30d7\u30ed\u30b0\u30e9\u30e0\u3067\u6587\u5b57\u5217\u3092\u51e6\u7406\u3059\u308b\u9593\u306f unicode \u3067\u6271\u3046\u3002<\/p>\n<pre class=\"brush: python; title: ; notranslate\" title=\"\">\r\n#!~\/usr\/local\/bin\/python\r\n# -*- coding: utf-8 -*-\r\nimport chardet\r\nimport urllib2\r\nfrom HTMLParser import HTMLParser\r\n\r\nclass MyLinkItem:\r\n    def __init__(self):\r\n        self.m_ref = ''\r\n        self.m_title = ''\r\n    def set_ref(self, val):\r\n        self.m_ref = val\r\n    def set_title(self, val):\r\n        self.m_title = val\r\n    def get_ref(self):\r\n        return self.m_ref.encode('utf-8')\r\n    def get_title(self):\r\n        return self.m_title.encode('utf-8')\r\n\r\nclass MyParser(HTMLParser):\r\n    def __init__(self):\r\n        HTMLParser.__init__(self)\r\n        self.m_linkItem = &#x5B;]\r\n        self.proc_atag = False\r\n        self.title = ''\r\n    def handle_starttag(self, tagname, attribute):\r\n        if tagname.lower() == 'a':\r\n            for i in attribute:\r\n                if i&#x5B;0].lower() == 'href':\r\n                    if re.match('\\\/url\\?q=http', i&#x5B;1]):\r\n                        if not re.search('webcache\\.googleusercontent', i&#x5B;1]):\r\n                            self.proc_atag = True\r\n                            self.title = ''\r\n                            linkItem = MyLinkItem()\r\n                            linkItem.set_ref(i&#x5B;1])\r\n                            self.m_linkItem.append(linkItem)\r\n    def handle_endtag(self, tagname):\r\n        if tagname.lower() == 'a':\r\n            if self.proc_atag == True:\r\n                self.m_linkItem&#x5B;-1].set_title(self.title)\r\n                self.proc_atag = False\r\n    def handle_data(self, data):\r\n        if self.proc_atag == True:\r\n            self.title = self.title + data\r\n    def get_linkItem(self):\r\n        return self.m_linkItem\r\n\r\ndef parse(url):\r\n    # load html\r\n    opener = urllib2.build_opener()\r\n    opener.addheaders = &#x5B;('User-agent', 'Mozilla\/5.0')]\r\n    html = opener.open(url)\r\n    data = html.read()\r\n    html.close()\r\n    rr = chardet.detect(data)           # \u6587\u5b57\u30b3\u30fc\u30c9\u3092\u5224\u5b9a\r\n    data = data.decode(rr&#x5B;'encoding'])  # unicode\u306b\u5909\u63db\r\n    # parse\r\n    parser = MyParser()\r\n    parser.feed(data)\r\n    linkItem = parser.get_linkItem()\r\n    parser.close()\r\n    return linkItem\r\n<\/pre>\n<hr class=\"my_hr_bottom\">\n","protected":false},"excerpt":{"rendered":"<p>Google\u691c\u7d22\uff06\u7d50\u679c\u8868\u793a\u3092Python\u7d4c\u7531\u3067\u884c\u3046\u3002 \u30a6\u30a7\u30d6\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u306e\u57fa\u790e\u306b\u306a\u308b\u306e\u304b\u3082(\uff1f) \u5b9f\u884c\u30b5\u30f3\u30d7\u30eb\u306f\u3053\u3061\u3089\u3002 https:\/\/www.dogrow.net\/python\/sample\/0078\/ (1\/2)\u2026 <span class=\"read-more\"><a href=\"https:\/\/www.dogrow.net\/python\/blog79\/\">\u7d9a\u304d\u3092\u8aad\u3080 &raquo;<\/a><\/span><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[26],"tags":[],"class_list":["post-632","post","type-post","status-publish","format-standard","hentry","category-web"],"views":3488,"amp_enabled":true,"_links":{"self":[{"href":"https:\/\/www.dogrow.net\/python\/wp-json\/wp\/v2\/posts\/632","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.dogrow.net\/python\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.dogrow.net\/python\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.dogrow.net\/python\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.dogrow.net\/python\/wp-json\/wp\/v2\/comments?post=632"}],"version-history":[{"count":40,"href":"https:\/\/www.dogrow.net\/python\/wp-json\/wp\/v2\/posts\/632\/revisions"}],"predecessor-version":[{"id":2711,"href":"https:\/\/www.dogrow.net\/python\/wp-json\/wp\/v2\/posts\/632\/revisions\/2711"}],"wp:attachment":[{"href":"https:\/\/www.dogrow.net\/python\/wp-json\/wp\/v2\/media?parent=632"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.dogrow.net\/python\/wp-json\/wp\/v2\/categories?post=632"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.dogrow.net\/python\/wp-json\/wp\/v2\/tags?post=632"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}