{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Convert NOAA weahter data file \".dly\" to Pandas DataFrame\n", "\n", "Follow this instruction https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt\n", "\n", "Get data from ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily\n", "\n", "" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2018-02-03T06:16:48.354909Z", "start_time": "2018-02-03T06:16:45.219683Z" }, "collapsed": true }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import numpy as np\n", "import re\n", "import ftplib\n", "\n", "%matplotlib notebook" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2018-02-03T06:16:48.369288Z", "start_time": "2018-02-03T06:16:48.356797Z" }, "collapsed": true }, "outputs": [], "source": [ "# download data from FTP\n", "\n", "def download_file_from_ftp(FTP_SERVER,FTP_PATH,FILENAME):\n", " with ftplib.FTP(FTP_SERVER) as ftp:\n", " ftp.login()\n", " ftp.cwd(FTP_PATH)\n", " with open(FILENAME, 'wb') as f:\n", " ftp.retrbinary('RETR ' + FILENAME, f.write)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Query station ID" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2018-02-03T06:16:48.407788Z", "start_time": "2018-02-03T06:16:48.371677Z" }, "collapsed": true, "scrolled": true }, "outputs": [], "source": [ "def get_station_ID(station_to_find, filename):\n", " for line in open(filename):\n", " if station_to_find in line:\n", " line_with_station=line\n", " station_ID=re.split(\" \",line_with_station)[0]\n", " return station_ID\n", " return None\n", "# warning, it is slow, download it only once\n", "download_file_from_ftp(\"ftp.ncdc.noaa.gov\", \"/pub/data/ghcn/daily\", \"ghcnd-stations.txt\")\n", "\n", "station_to_find=\"GUANGZHOU\" # USE CAPS\n", "station_ID=get_station_ID(station_to_find, \"ghcnd-stations.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Download weather data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2018-02-03T06:16:54.711515Z", "start_time": "2018-02-03T06:16:48.410687Z" }, "collapsed": true }, "outputs": [], "source": [ "weather_data_filename=station_ID+'.dly'\n", "\n", "# warning, it is slow, download it only once\n", "download_file_from_ftp(\"ftp.ncdc.noaa.gov\", \"/pub/data/ghcn/daily/all\", weather_data_filename)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Convert .dly to pandas Dataframe" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2018-02-03T06:18:19.782253Z", "start_time": "2018-02-03T06:18:19.412362Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
YEARMONTHELEMENTVALUE1VALUE2VALUE3VALUE4VALUE5VALUE6VALUE7...VALUE22VALUE23VALUE24VALUE25VALUE26VALUE27VALUE28VALUE29VALUE30VALUE31
0194511TAVGNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN107.0NaN
1194512TAVG123.0136.0152.0144.0146.0189.0219.0...179.0146.0128.0107.0104.0112.0122.0127.0129.0156.0
219461TAVG150.0150.0123.0117.0112.0121.0125.0...146.0153.0173.0196.0211.0212.0218.0201.0156.0131.0
319462TAVG114.0112.0147.0181.0195.0192.0149.0...201.0196.0231.0226.0221.0229.0240.0NaNNaNNaN
419463TAVG237.0162.0142.0133.0183.0187.0160.0...183.0192.0205.0216.0223.0238.0207.0195.0233.0228.0
\n", "

5 rows × 34 columns

\n", "
" ], "text/plain": [ " YEAR MONTH ELEMENT VALUE1 VALUE2 VALUE3 VALUE4 VALUE5 VALUE6 \\\n", "0 1945 11 TAVG NaN NaN NaN NaN NaN NaN \n", "1 1945 12 TAVG 123.0 136.0 152.0 144.0 146.0 189.0 \n", "2 1946 1 TAVG 150.0 150.0 123.0 117.0 112.0 121.0 \n", "3 1946 2 TAVG 114.0 112.0 147.0 181.0 195.0 192.0 \n", "4 1946 3 TAVG 237.0 162.0 142.0 133.0 183.0 187.0 \n", "\n", " VALUE7 ... VALUE22 VALUE23 VALUE24 VALUE25 VALUE26 VALUE27 \\\n", "0 NaN ... NaN NaN NaN NaN NaN NaN \n", "1 219.0 ... 179.0 146.0 128.0 107.0 104.0 112.0 \n", "2 125.0 ... 146.0 153.0 173.0 196.0 211.0 212.0 \n", "3 149.0 ... 201.0 196.0 231.0 226.0 221.0 229.0 \n", "4 160.0 ... 183.0 192.0 205.0 216.0 223.0 238.0 \n", "\n", " VALUE28 VALUE29 VALUE30 VALUE31 \n", "0 NaN NaN 107.0 NaN \n", "1 122.0 127.0 129.0 156.0 \n", "2 218.0 201.0 156.0 131.0 \n", "3 240.0 NaN NaN NaN \n", "4 207.0 195.0 233.0 228.0 \n", "\n", "[5 rows x 34 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def convert_dly_to_dataframe(dly_filename): \n", " def build_dly_value_dict():\n", " dly_value_dict_base={\n", " \"VALUE\":[22,26],\n", " \"MFLAG\":[27,27],\n", " \"QFLAG\":[28,28],\n", " \"SFLAG\":[29,29]}\n", " dly_value_step=8\n", " dly_value_dict={}\n", " for day in range(31):\n", " for k,v in dly_value_dict_base.items():\n", " dly_value_dict[k+str(day+1)]=[v[0]+dly_value_step*day,v[1]+dly_value_step*day]\n", " return dly_value_dict\n", " \n", " dly_dict={\n", " \"ID\": [1,11],\n", " \"YEAR\":[12,15], \n", " \"MONTH\": [16,17], \n", " \"ELEMENT\":[18,21],}\n", " dly_dict.update(build_dly_value_dict())\n", "\n", " columns_value=[\"VALUE\"+str(day+1) for day in range(31)]\n", " columns=[\"YEAR\",\"MONTH\",\"ELEMENT\"]+columns_value\n", "\n", " def extract_data_from_line(string,loc,astype=str):\n", " return astype(string[loc[0]-1:loc[1]])\n", "\n", " data=[]\n", " for line in open(dly_filename):\n", " data_item=[]\n", " for k in columns:\n", " data_item.append(extract_data_from_line(line,dly_dict[k]))\n", " data.append(data_item)\n", " df=(pd.DataFrame(data,columns=columns)\n", " .apply(pd.to_numeric, errors='ignore')\n", " .replace(-9999,np.nan))\n", " return df\n", "\n", "df=convert_dly_to_dataframe(weather_data_filename)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "nikola": { "category": "", "date": "2018-2-3 14:00 UTC+08:00", "description": "", "link": "", "slug": "NOAA-weather-data-from-dly-to-pandas-df", "tags": "python", "title": "NOAA天气数据格式转换", "type": "text" }, "toc": { "colors": { "hover_highlight": "#DAA520", "navigate_num": "#000000", "navigate_text": "#333333", "running_highlight": "#FF0000", "selected_highlight": "#FFD700", "sidebar_border": "#EEEEEE", "wrapper_background": "#FFFFFF" }, "moveMenuLeft": true, "nav_menu": { "height": "70px", "width": "254px" }, "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 4, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false, "widenNotebook": false } }, "nbformat": 4, "nbformat_minor": 2 }