{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert NOAA weahter data file \".dly\" to Pandas DataFrame\n",
    "\n",
    "Follow this instruction https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt\n",
    "\n",
    "Get data from ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily\n",
    "\n",
    "<!-- TEASER_END -->"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-03T06:16:48.354909Z",
     "start_time": "2018-02-03T06:16:45.219683Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import ftplib\n",
    "\n",
    "%matplotlib notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-03T06:16:48.369288Z",
     "start_time": "2018-02-03T06:16:48.356797Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# download data from FTP\n",
    "\n",
    "def download_file_from_ftp(FTP_SERVER,FTP_PATH,FILENAME):\n",
    "    with ftplib.FTP(FTP_SERVER) as ftp:\n",
    "        ftp.login()\n",
    "        ftp.cwd(FTP_PATH)\n",
    "        with open(FILENAME, 'wb') as f:\n",
    "            ftp.retrbinary('RETR ' + FILENAME, f.write)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Query station ID"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-03T06:16:48.407788Z",
     "start_time": "2018-02-03T06:16:48.371677Z"
    },
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def get_station_ID(station_to_find, filename):\n",
    "    for line in open(filename):\n",
    "        if station_to_find in line:\n",
    "            line_with_station=line\n",
    "            station_ID=re.split(\" \",line_with_station)[0]\n",
    "            return station_ID\n",
    "    return None\n",
    "# warning, it is slow, download it only once\n",
    "download_file_from_ftp(\"ftp.ncdc.noaa.gov\", \"/pub/data/ghcn/daily\", \"ghcnd-stations.txt\")\n",
    "\n",
    "station_to_find=\"GUANGZHOU\" # USE CAPS\n",
    "station_ID=get_station_ID(station_to_find, \"ghcnd-stations.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download weather data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-03T06:16:54.711515Z",
     "start_time": "2018-02-03T06:16:48.410687Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "weather_data_filename=station_ID+'.dly'\n",
    "\n",
    "# warning, it is slow, download it only once\n",
    "download_file_from_ftp(\"ftp.ncdc.noaa.gov\", \"/pub/data/ghcn/daily/all\", weather_data_filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert .dly to pandas Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-02-03T06:18:19.782253Z",
     "start_time": "2018-02-03T06:18:19.412362Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>YEAR</th>\n",
       "      <th>MONTH</th>\n",
       "      <th>ELEMENT</th>\n",
       "      <th>VALUE1</th>\n",
       "      <th>VALUE2</th>\n",
       "      <th>VALUE3</th>\n",
       "      <th>VALUE4</th>\n",
       "      <th>VALUE5</th>\n",
       "      <th>VALUE6</th>\n",
       "      <th>VALUE7</th>\n",
       "      <th>...</th>\n",
       "      <th>VALUE22</th>\n",
       "      <th>VALUE23</th>\n",
       "      <th>VALUE24</th>\n",
       "      <th>VALUE25</th>\n",
       "      <th>VALUE26</th>\n",
       "      <th>VALUE27</th>\n",
       "      <th>VALUE28</th>\n",
       "      <th>VALUE29</th>\n",
       "      <th>VALUE30</th>\n",
       "      <th>VALUE31</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1945</td>\n",
       "      <td>11</td>\n",
       "      <td>TAVG</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>107.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1945</td>\n",
       "      <td>12</td>\n",
       "      <td>TAVG</td>\n",
       "      <td>123.0</td>\n",
       "      <td>136.0</td>\n",
       "      <td>152.0</td>\n",
       "      <td>144.0</td>\n",
       "      <td>146.0</td>\n",
       "      <td>189.0</td>\n",
       "      <td>219.0</td>\n",
       "      <td>...</td>\n",
       "      <td>179.0</td>\n",
       "      <td>146.0</td>\n",
       "      <td>128.0</td>\n",
       "      <td>107.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>112.0</td>\n",
       "      <td>122.0</td>\n",
       "      <td>127.0</td>\n",
       "      <td>129.0</td>\n",
       "      <td>156.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1946</td>\n",
       "      <td>1</td>\n",
       "      <td>TAVG</td>\n",
       "      <td>150.0</td>\n",
       "      <td>150.0</td>\n",
       "      <td>123.0</td>\n",
       "      <td>117.0</td>\n",
       "      <td>112.0</td>\n",
       "      <td>121.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>...</td>\n",
       "      <td>146.0</td>\n",
       "      <td>153.0</td>\n",
       "      <td>173.0</td>\n",
       "      <td>196.0</td>\n",
       "      <td>211.0</td>\n",
       "      <td>212.0</td>\n",
       "      <td>218.0</td>\n",
       "      <td>201.0</td>\n",
       "      <td>156.0</td>\n",
       "      <td>131.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1946</td>\n",
       "      <td>2</td>\n",
       "      <td>TAVG</td>\n",
       "      <td>114.0</td>\n",
       "      <td>112.0</td>\n",
       "      <td>147.0</td>\n",
       "      <td>181.0</td>\n",
       "      <td>195.0</td>\n",
       "      <td>192.0</td>\n",
       "      <td>149.0</td>\n",
       "      <td>...</td>\n",
       "      <td>201.0</td>\n",
       "      <td>196.0</td>\n",
       "      <td>231.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>221.0</td>\n",
       "      <td>229.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1946</td>\n",
       "      <td>3</td>\n",
       "      <td>TAVG</td>\n",
       "      <td>237.0</td>\n",
       "      <td>162.0</td>\n",
       "      <td>142.0</td>\n",
       "      <td>133.0</td>\n",
       "      <td>183.0</td>\n",
       "      <td>187.0</td>\n",
       "      <td>160.0</td>\n",
       "      <td>...</td>\n",
       "      <td>183.0</td>\n",
       "      <td>192.0</td>\n",
       "      <td>205.0</td>\n",
       "      <td>216.0</td>\n",
       "      <td>223.0</td>\n",
       "      <td>238.0</td>\n",
       "      <td>207.0</td>\n",
       "      <td>195.0</td>\n",
       "      <td>233.0</td>\n",
       "      <td>228.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 34 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   YEAR  MONTH ELEMENT  VALUE1  VALUE2  VALUE3  VALUE4  VALUE5  VALUE6  \\\n",
       "0  1945     11    TAVG     NaN     NaN     NaN     NaN     NaN     NaN   \n",
       "1  1945     12    TAVG   123.0   136.0   152.0   144.0   146.0   189.0   \n",
       "2  1946      1    TAVG   150.0   150.0   123.0   117.0   112.0   121.0   \n",
       "3  1946      2    TAVG   114.0   112.0   147.0   181.0   195.0   192.0   \n",
       "4  1946      3    TAVG   237.0   162.0   142.0   133.0   183.0   187.0   \n",
       "\n",
       "   VALUE7   ...     VALUE22  VALUE23  VALUE24  VALUE25  VALUE26  VALUE27  \\\n",
       "0     NaN   ...         NaN      NaN      NaN      NaN      NaN      NaN   \n",
       "1   219.0   ...       179.0    146.0    128.0    107.0    104.0    112.0   \n",
       "2   125.0   ...       146.0    153.0    173.0    196.0    211.0    212.0   \n",
       "3   149.0   ...       201.0    196.0    231.0    226.0    221.0    229.0   \n",
       "4   160.0   ...       183.0    192.0    205.0    216.0    223.0    238.0   \n",
       "\n",
       "   VALUE28  VALUE29  VALUE30  VALUE31  \n",
       "0      NaN      NaN    107.0      NaN  \n",
       "1    122.0    127.0    129.0    156.0  \n",
       "2    218.0    201.0    156.0    131.0  \n",
       "3    240.0      NaN      NaN      NaN  \n",
       "4    207.0    195.0    233.0    228.0  \n",
       "\n",
       "[5 rows x 34 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def convert_dly_to_dataframe(dly_filename): \n",
    "    def build_dly_value_dict():\n",
    "        dly_value_dict_base={\n",
    "        \"VALUE\":[22,26],\n",
    "        \"MFLAG\":[27,27],\n",
    "        \"QFLAG\":[28,28],\n",
    "        \"SFLAG\":[29,29]}\n",
    "        dly_value_step=8\n",
    "        dly_value_dict={}\n",
    "        for day in range(31):\n",
    "            for k,v in dly_value_dict_base.items():\n",
    "                dly_value_dict[k+str(day+1)]=[v[0]+dly_value_step*day,v[1]+dly_value_step*day]\n",
    "        return dly_value_dict\n",
    "    \n",
    "    dly_dict={\n",
    "        \"ID\": [1,11],\n",
    "        \"YEAR\":[12,15], \n",
    "        \"MONTH\": [16,17], \n",
    "        \"ELEMENT\":[18,21],}\n",
    "    dly_dict.update(build_dly_value_dict())\n",
    "\n",
    "    columns_value=[\"VALUE\"+str(day+1) for day in range(31)]\n",
    "    columns=[\"YEAR\",\"MONTH\",\"ELEMENT\"]+columns_value\n",
    "\n",
    "    def extract_data_from_line(string,loc,astype=str):\n",
    "        return astype(string[loc[0]-1:loc[1]])\n",
    "\n",
    "    data=[]\n",
    "    for line in open(dly_filename):\n",
    "        data_item=[]\n",
    "        for k in columns:\n",
    "            data_item.append(extract_data_from_line(line,dly_dict[k]))\n",
    "        data.append(data_item)\n",
    "    df=(pd.DataFrame(data,columns=columns)\n",
    "         .apply(pd.to_numeric, errors='ignore')\n",
    "         .replace(-9999,np.nan))\n",
    "    return df\n",
    "\n",
    "df=convert_dly_to_dataframe(weather_data_filename)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "nikola": {
   "category": "",
   "date": "2018-2-3 14:00 UTC+08:00",
   "description": "",
   "link": "",
   "slug": "NOAA-weather-data-from-dly-to-pandas-df",
   "tags": "python",
   "title": "NOAA天气数据格式转换",
   "type": "text"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "70px",
    "width": "254px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}