{"id":12388,"date":"2017-08-15T11:00:24","date_gmt":"2017-08-15T02:00:24","guid":{"rendered":"http:\/\/sitest.jp\/blog\/?p=12388"},"modified":"2017-08-16T14:44:06","modified_gmt":"2017-08-16T05:44:06","slug":"%e3%80%90python3%e7%b3%bb%e5%af%be%e5%bf%9c%e3%80%91-pdfminer3k%e3%81%a7pdf%e3%83%95%e3%82%a1%e3%82%a4%e3%83%ab%e3%81%8b%e3%82%89%e3%83%87%e3%83%bc%e3%82%bf%e6%8a%bd%e5%87%ba%e3%81%97%e3%81%a6","status":"publish","type":"post","link":"https:\/\/sitest.jp\/blog\/?p=12388","title":{"rendered":"\u3010Python3\u7cfb\u5bfe\u5fdc\u3011 PDFMiner3k\u3067PDF\u30d5\u30a1\u30a4\u30eb\u304b\u3089\u30c7\u30fc\u30bf\u62bd\u51fa\u3057\u3066\u5206\u6790\u3057\u3088\u3046\uff01(\u5f8c\u7de8)"},"content":{"rendered":"<p>\u3055\u3066\u3001<a href=\"http:\/\/sitest.jp\/blog\/?p=12170\" target=\"_blank\">\u524d\u56de<\/a>\u30aa\u30fc\u30d7\u30f3\u30c7\u30fc\u30bf\u306e PDF \u304b\u3089\u30c6\u30ad\u30b9\u30c8\u5f62\u5f0f\u306e\u30c7\u30fc\u30bf\u3092\u62bd\u51fa\u3044\u305f\u3057\u307e\u3057\u305f\u3002<\/p>\n<p>\u3057\u304b\u3057\u3001\u3082\u3068\u3082\u3068\u8868\u5f62\u5f0f\u3060\u3063\u305f PDF \u304c\u3001\u3059\u3079\u3066\u306e\u30c7\u30fc\u30bf\u304c\u4e00\u5217\u306b\u4e26\u3093\u3060\u3001\u3068\u3066\u3082\u60b2\u60e8\u306a\u5f62\u5f0f\u306b\u306a\u3063\u3066\u3057\u307e\u3044\u307e\u3057\u305f\u3002\u3053\u308c\u3092\u3069\u3046\u306b\u304b\u8a08\u7b97\u3067\u304d\u308b\u5f62\u306b\u6301\u3063\u3066\u3044\u304d\u305f\u3044\u306e\u3067\u3059\u304c\u3001\u3058\u3064\u306f\u3001\u30e9\u30a4\u30d6\u30e9\u30ea\u3092\u3064\u304b\u3063\u3066\u3061\u3087\u3063\u3068\u3044\u3058\u308c\u3070\u3069\u3046\u306b\u304b\u306a\u308b\u3002\u3068\u3044\u3046\u3082\u306e\u3067\u306f\u3042\u308a\u307e\u305b\u3093\u3002<br \/>\n<!--more--><br \/>\n\u524d\u56de\u8ff0\u3079\u305f\u3088\u3046\u306b PDF 
\u306f\u6587\u66f8\u306e\u300c\u69cb\u9020\u300d\u306b\u95a2\u3059\u308b\u30c7\u30fc\u30bf\u3092\u6301\u305f\u306a\u3044\u305f\u3081\u3001\u305f\u3068\u3048\u8868\u306e\u3088\u3046\u306b\u898b\u3048\u3066\u3044\u305f\u3068\u3057\u3066\u3082\u3001\u5168\u90e8\u5358\u306a\u308b\u6587\u5b57\u306e\u4e26\u3073\u3067\u3059\u3002\u8868\u306e\u3088\u3046\u306a\u30c7\u30fc\u30bf\u3068\u3057\u3066\u6271\u3046\u305f\u3081\u306b\u306f<br \/>\n\u305d\u308c\u305e\u308c\u306e\u300c\u6587\u5b57\u300d\u304c\u300c\u8868\u300d\u306b\u304a\u3044\u3066\u3069\u3046\u3044\u3046\u610f\u5473\u3092\u6301\u3064\u306e\u304b\u3001\u3053\u3061\u3089\u304c\u6307\u5b9a\u3057\u3066\u3042\u3052\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002<\/p>\n<p>\u3044\u308d\u3044\u308d\u306a\u8abf\u6574\u3092\u884c\u3044\u306a\u304c\u3089\u3001\u4e0b\u8a18\u306e\u3088\u3046\u306a\u30bd\u30fc\u30b9\u3092\u66f8\u304d\u307e\u3057\u305f\u3002<\/p>\n<pre><code>from pprint import pprint\nimport re\nimport numpy as np\n\ndef birth_list(converted_pdf_file):\n    birth_data=[]\n    birth_row =[]\n    for line in open(converted_pdf_file, 'r'):\n        #\u6539\u884c\u306e\u307f\u306e\u884c\u304c\u5b58\u5728\u3057\u305f\u3089\u3001\u5225\u306e\u9805\u76ee\u3068\u8a8d\u8b58\u3059\u308b\n        if line == \"\\n\":\n            if len(birth_row) &gt; 3:\n                birth_data.append(birth_row)\n            birth_row=[]\n        #\u5e74\u306f\u6587\u5b57\u5217\u306e\u307e\u307e\u3002\u6700\u521d\u3060\u3051\u5e74\u53f7\u304c\u66f8\u304b\u308c\u3066\u3044\u308b\u5834\u5408\u304c\u3042\u308b\u306e\u3067\u53d6\u308a\u9664\u304f\u5fc5\u8981\u304c\u3042\u308b\n        elif re.match(\".+\u5e74$\", line):\n            if not re.match(\"^[0-9]+\u5e74$\", line):\n                birth_row.append((line.split())[0])\n        elif re.match(\"^[1|2][0-9]{3}\\n$\", line):\n            line = line.rstrip('\\n')\n            birth_row.append(line)\n        #\u6570\u5b57\u306f int\u307e\u305f\u306f float 
\u5f62\u5f0f\u306b\u5909\u63db\n        else:\n            line = line.replace(',', '')\n            try:\n                birth_row.append(int(line))\n            except ValueError:\n                try:\n                    birth_row.append(float(line))\n                except ValueError:\n                    pass\n    return birth_data\n\n\nbirth_data = birth_list('converted_data.txt')\nbirth_data = np.array(birth_data)\n#\u51fa\u529b\nprint(np.transpose(birth_data))\n\n<\/code><\/pre>\n<p>\u307e\u305a\u3001\u5b8c\u5168\u306b\u5d29\u308c\u3066\u3057\u307e\u3063\u305f\u9805\u76ee\u540d\u306e\u90e8\u5206\u3092\u30c6\u30ad\u30b9\u30c8\u304b\u3089\u629c\u304d\u51fa\u3059\u306e\u306f\u3001\u3059\u3063\u3071\u308a\u3042\u304d\u3089\u3081\u307e\u3059\u3002<\/p>\n<p>\u62bd\u51fa\u3055\u308c\u305f\u30c6\u30ad\u30b9\u30c8\u30c7\u30fc\u30bf\u3092\u307f\u3066\u3044\u308b\u3068\u30c7\u30fc\u30bf\u5217\u3068\u30c7\u30fc\u30bf\u5217\u306e\u9593\u306f\u3001\u898f\u5247\u7684\u306b1\u884c1\u884c\u7a7a\u3044\u3066\u3044\u308b\u306e\u3067\u305d\u308c\u3092\u76ee\u5370\u306b\u9805\u76ee\uff11\u5217\u305a\u3064\u914d\u5217\u306b\u5165\u308c\u3066\u3044\u304d\u307e\u3059\u3002<\/p>\n<p>\u5e74\u306f\u6587\u5b57\u5217\u3001\u4ed6\u306e\u5024\u306f\u6570\u5b57\u3068\u3057\u3066\u3001\u578b\u5909\u63db\u3057\u3066\u3044\u304d\u307e\u3059\u3002\u305d\u3046\u3084\u3063\u3066\u3001\u9805\u76ee\u3054\u3068\u306b\u3067\u304d\u305f\u914d\u5217\u3092\u3001\u66f4\u306b\u5225\u306e\u7a7a\u914d\u5217\u306b\u5165\u308c\u3066\u3044\u304d\u3001\u8868\u306e\u3088\u3046\u306a2\u91cd\u914d\u5217\u3092\u4f5c\u308a\u307e\u3059\u3002<\/p>\n<p>\u305f\u3060\u3053\u306e\u307e\u307e\u3067\u306f\u3001\u30c7\u30fc\u30bf\u306e\u300c\u30bf\u30c6\u300d\u3068\u300c\u30e8\u30b3\u300d\u304c\u9006\u306a\u306e\u3067\u3001\uff12\u91cd\u914d\u5217\u3092\u6570\u5024\u8a08\u7b97\u30e9\u30a4\u30d6\u30e9\u30ea\u3001Numpy \u306e Array 
\u5f62\u5f0f\u306b\u5909\u63db\u3057\u3001transpose\u30e1\u30bd\u30c3\u30c9\u3067\u300c\u56de\u8ee2\u300d\u3055\u305b\u3001\u300c\u30bf\u30c6\u300d\u3068\u300c\u30e8\u30b3\u300d\u3092\u3044\u308c\u304b\u3048\u307e\u3059\u3002<\/p>\n<p>\u305d\u3046\u3084\u3063\u3066\u3067\u304d\u305f\u30c7\u30fc\u30bf\u306f\u4e0b\u8a18\u306e\u3088\u3046\u306b\u306a\u308a\u307e\u3059\u3002<\/p>\n<p>\u51fa\u529b<\/p>\n<pre><code><br \/>[['1960' '51632' '17.1' '95012' '17.3' '1606041' '17.2']\n ['1961' '53351' '17.3' '101318' '17.7' '1589372' '16.9']\n ['1962' '57682' '18.3' '114469' '19.2' '1618616' '17.0']\n ['1963' '58000' '18.1' '121396' '19.6' '1659521' '17.3']\n ['1964' '59035' '18.4' '131994' '20.6' '1716761' '17.7']\n ['1965' '62003' '19.6' '147249' '22.1' '1823697' '18.6']\n ['1966' '44869' '14.3' '111099' '16.3' '1360974' '13.7']\n ['1967' '59298' '19.6' '158482' '23.2' '1935647' '19.4']\n ['1968' '56758' '19.1' '159740' '22.7' '1871839' '18.6']\n ['1969' '54525' '18.5' '163995' '22.7' '1889815' '18.5']\n ['1970' '53165' '17.8' '169880' '22.8' '1934239' '18.8']\n ['1971' '52003' '17.7' '175049' '23.0' '2000973' '19.2']\n ['1972' '50068' '17.3' '176094' '22.6' '2038682' '19.3']\n ['1973' '47647' '16.8' '174330' '22.1' '2091983' '19.4']\n ['1974' '43839' '15.6' '165560' '20.7' '2029989' '18.6']\n ['1975' '39197' '14.1' '150653' '18.6' '1901440' '17.1']\n ['1976' '35951' '13.1' '140032' '17.1' '1832617' '16.3']\n ['1977' '33946' '12.5' '132172' '16.1' '1755100' '15.5']\n ['1978' '31775' '11.8' '124711' '15.1' '1708643' '14.9']\n ['1979' '30192' '11.3' '115990' '13.9' '1642580' '14.2']\n ['1980' '28919' '11.0' '111956' '13.5' '1576889' '13.6']\n ['1981' '28415' '10.8' '106456' '12.8' '1529455' '13.0']\n ['1982' '28406' '10.8' '106098' '12.7' '1515392' '12.8']\n ['1983' '28564' '10.9' '105432' '12.5' '1508687' '12.7']\n ['1984' '28607' '10.9' '103595' '12.2' '1489780' '12.5']\n ['1985' '28369' '10.7' '100328' '11.8' '1431577' '11.9']\n ['1986' '28125' '10.6' '97693' '11.4' 
'1382946' '11.4']\n ['1987' '27600' '10.4' '94828' '11.1' '1346658' '11.1']\n ['1988' '27446' '10.4' '93315' '10.9' '1314006' '10.8']\n ['1989' '26077' '9.9' '88468' '10.3' '1246802' '10.2']\n ['1990' '25065' '9.6' '86840' '10.1' '1221585' '10.0']\n ['1991' '25297' '9.7' '86795' '10.1' '1223245' '9.9']\n ['1992' '24810' '9.5' '86658' '10.1' '1208989' '9.8']\n ['1993' '24008' '9.3' '84151' '9.8' '1188282' '9.6']\n ['1994' '24124' '9.8' '88419' '10.4' '1238328' '10.0']\n ['1995' '24467' '9.8' '86076' '10.0' '1187064' '9.6']\n ['1996' '24862' '9.6' '89291' '10.3' '1206555' '9.7']\n ['1997' '24994' '10.1' '89043' '10.3' '1191665' '9.5']\n ['1998' '24898' '9.6' '90324' '10.5' '1203147' '9.6']\n ['1999' '24203' '9.3' '88385' '10.2' '1177669' '9.4']\n ['2000' '24136' '9.6' '88163' '10.2' '1190547' '9.5']\n ['2001' '24132' '9.2' '86000' '9.9' '1170662' '9.3']\n ['2002' '23635' '9.0' '83883' '9.7' '1153855' '9.2']\n ['2003' '23228' '8.8' '81001' '9.4' '1123610' '8.9']\n ['2004' '22794' '8.7' '79719' '9.2' '1110721' '8.8']\n ['2005' '21913' '8.3' '76111' '8.8' '1062530' '8.4']\n ['2006' '22949' '8.7' '77641' '9.0' '1092674' '8.7']\n ['2007' '22892' '8.7' '76914' '8.9' '1089818' '8.6']\n ['2008' '23290' '8.8' '77400' '9.0' '1091156' '8.7']\n ['2009' '22738' '8.5' '75248' '8.7' '1070035' '8.5']\n ['2010' '23061' '8.7' '75080' '8.6' '1071304' '8.5']\n ['2011' '22992' '8.6' '73919' '8.5' '1050806' '8.3']\n ['2012' '22763' '8.5' '73012' '8.4' '1037231' '8.2']\n ['2013' '22626' '8.4' '72054' '8.3' '1029816' '8.2']\n ['2014' '21940' '8.2' '69968' '8.1' '1003539' '8.0']\n ['2015' '22351' '8.6' '70596' '8.1' '1005677' '8.0']]\n<\/code><\/pre>\n<p>\u3082\u3046\u5c11\u3057\u30b9\u30de\u30fc\u30c8\u306b\u3067\u304d\u306a\u304b\u3063\u305f\u306e\u304b\u3068\u53cd\u7701\u3057\u307e\u3059\u304c\u3001PDF 
\u306e\u5185\u5bb9\u306f\u30d0\u30e9\u30a8\u30c6\u30a3\u306b\u5bcc\u307f\u3059\u304e\u3066\u304a\u308a\u3001\u300c\u6a5f\u68b0\u5224\u8aad\u306b\u9069\u3057\u305f\u300d\u5f62\u306b\u3059\u308b\u306b\u306f\u69d8\u3005\u306a\u30d1\u30bf\u30fc\u30f3\u3092\u7db2\u7f85\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u3001\u304b\u306a\u308a\u624b\u9593\u304c\u304b\u304b\u308a\u307e\u3059\u3002<\/p>\n<p>\u30aa\u30fc\u30d7\u30f3\u30c7\u30fc\u30bf\u306e\u516c\u958b\u306b\u95a2\u308f\u308b\u304b\u305f\u306b\u306f\u3001PDF \u3084 EXCEL \u3068\u5909\u308f\u3089\u306a\u3044\u624b\u9593\u3067\u516c\u958b\u3067\u304d\u307e\u3059\u306e\u3067\u3001CSV \u5f62\u5f0f\u3067\u306e\u516c\u958b\u3092\u662f\u975e\u691c\u8a0e\u3057\u3066\u3044\u305f\u3060\u304d\u305f\u3044\u3068\u3053\u308d\u3067\u306f\u3042\u308a\u307e\u3059\u3002\u305d\u308c\u3067\u306f\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u3055\u3066\u3001\u524d\u56de\u30aa\u30fc\u30d7\u30f3\u30c7\u30fc\u30bf\u306e PDF \u304b\u3089\u30c6\u30ad\u30b9\u30c8\u5f62\u5f0f\u306e\u30c7\u30fc\u30bf\u3092\u62bd\u51fa\u3044\u305f\u3057\u307e\u3057\u305f\u3002 \u3057\u304b\u3057\u3001\u3082\u3068\u3082\u3068\u8868\u5f62\u5f0f\u3060\u3063\u305f PDF 
\u304c\u3001\u3059\u3079\u3066\u306e\u30c7\u30fc\u30bf\u304c\u4e00\u5217\u306b\u4e26\u3093\u3060\u3001\u3068\u3066\u3082\u60b2\u60e8\u306a\u5f62\u5f0f\u306b\u306a\u3063\u3066\u3057\u307e\u3044\u307e\u3057\u305f\u3002\u3053\u308c\u3092\u3069\u3046\u306b\u304b\u8a08\u7b97\u3067\u304d\u308b\u5f62\u306b\u6301\u3063\u3066\u3044\u304d\u305f\u3044\u306e\u3067\u3059\u304c\u3001\u3058\u3064\u306f\u3001\u30e9\u30a4\u30d6\u30e9\u30ea\u3092\u3064\u304b\u3063\u3066\u3061\u3087\u3063\u3068\u3044\u3058\u308c\u3070\u3069\u3046\u306b\u304b\u306a\u308b\u3002\u3068\u3044\u3046\u3082\u306e\u3067\u306f\u3042\u308a\u307e\u305b\u3093\u3002<\/p>\n","protected":false},"author":1,"featured_media":12389,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[27],"tags":[],"acf":false,"_links":{"self":[{"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/posts\/12388"}],"collection":[{"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/comments?post=12388"}],"version-history":[{"count":4,"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/posts\/12388\/revisions"}],"predecessor-version":[{"id":12460,"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/posts\/12388\/revisions\/12460"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/media\/12389"}],"wp:attachment":[{"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/media?parent=12388"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/categories?post=12388"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/sitest.jp\/blog\/wp-json\/wp\/v2\/tags?post=12388"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","tem
plated":true}]}}