url-parser.c

URL parsing library
git clone git://git.finwo.net/lib/url-parser.c
Log | Files | Refs | README | LICENSE

url-parser.c (11474B)


      1 /*_
      2  * Copyright (c) 2026 finwo
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a copy of
      5  * this software and associated documentation files (the "Software"), to use, copy,
      6  * modify, and distribute the Software, subject to the following conditions:
      7  *
      8  *  1. Redistributions of source code must retain the above copyright notice, this
      9  *     list of conditions, and the following disclaimer.
     10  *
     11  *  2. Redistributions in binary form, or any public offering of the Software
     12  *     (including hosted or managed services), must reproduce the above copyright
     13  *     notice, this list of conditions, and the following disclaimer in the
     14  *     documentation and/or other materials provided.
     15  *
     16  *  3. Any redistribution or public offering of the Software must clearly attribute
     17  *     the Software to the original copyright holder, reference this License, and
     18  *     include a link to the official project repository or website.
     19  *
     20  *  4. The Software may not be renamed, rebranded, or marketed in a manner that
     21  *     implies it is an independent or proprietary product. Derivative works must
     22  *     clearly state that they are based on the Software.
     23  *
     24  *  5. Modifications to copies of the Software must carry prominent notices stating
     25  *     that changes were made, the nature of the modifications, and the date of the
     26  *     modifications.
     27  *
     28  * Any violation of these conditions terminates the permissions granted herein.
     29  *
     30  * THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     31  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
     32  * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT
     33  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
     34  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     35  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     36  */
     37 
     38 #include "url-parser.h"
     39 
     40 #include <ctype.h>
     41 #include <limits.h>
     42 #include <stdio.h>
     43 #include <stdlib.h>
     44 #include <string.h>
     45 
     46 static const char *_last_error = NULL;
     47 
     48 const char *parse_url_last_error(void) {
     49   return _last_error;
     50 }
     51 
     52 #define SET_ERROR(msg)   \
     53   do {                   \
     54     _last_error = (msg); \
     55   } while (0)
     56 
     57 /*
     58  * Prototype declarations
     59  */
     60 static __inline__ int _is_scheme_char(int);
     61 
     62 /*
     63  * Check if scheme is a path-based scheme (unix socket, file path, etc.)
     64  */
     65 static __inline__ int _is_path_scheme(const char *scheme) {
     66   if (NULL == scheme) {
     67     return 0;
     68   }
     69   if (0 == strncmp(scheme, "unix", 4) && scheme[4] == '\0') {
     70     return 1;
     71   }
     72   if (0 == strncmp(scheme, "file", 4) && scheme[4] == '\0') {
     73     return 1;
     74   }
     75   if (0 == strncmp(scheme, "cunix", 5) && scheme[5] == '\0') {
     76     return 1;
     77   }
     78   return 0;
     79 }
     80 
     81 /*
     82  * Check whether the character is permitted in scheme string
     83  */
     84 static __inline__ int _is_scheme_char(int c) {
     85   return (!isalpha(c) && '+' != c && '-' != c && '.' != c) ? 0 : 1;
     86 }
     87 
     88 /*
     89  * See RFC 1738, 3986
     90  */
     91 struct parsed_url *parse_url(const char *url) {
     92   struct parsed_url *purl;
     93   const char        *tmpstr;
     94   const char        *curstr;
     95   size_t             len;
     96   size_t             i;
     97   int                bracket_flag;
     98   int                is_path;
     99 
    100   /* Allocate the parsed url storage */
    101   purl = malloc(sizeof(struct parsed_url));
    102   if (NULL == purl) {
    103     return NULL;
    104   }
    105   purl->scheme   = NULL;
    106   purl->host     = NULL;
    107   purl->port     = NULL;
    108   purl->path     = NULL;
    109   purl->query    = NULL;
    110   purl->fragment = NULL;
    111   purl->username = NULL;
    112   purl->password = NULL;
    113 
    114   curstr = url;
    115 
    116   /*
    117    * <scheme>:<scheme-specific-part>
    118    * <scheme> := [a-z\+\-\.]+
    119    *             upper case = lower case for resiliency
    120    */
    121   /* Read scheme */
    122   tmpstr = strchr(curstr, ':');
    123   if (NULL == tmpstr) {
    124     parsed_url_free(purl);
    125     return NULL;
    126   }
    127   /* Get the scheme length */
    128   len = tmpstr - curstr;
    129   if (len > 64) {
    130     SET_ERROR("ERR_SCHEME_TOO_LONG");
    131     parsed_url_free(purl);
    132     return NULL;
    133   }
    134   /* Check restrictions */
    135   for (i = 0; i < len; i++) {
    136     if (!_is_scheme_char(curstr[i])) {
    137       parsed_url_free(purl);
    138       return NULL;
    139     }
    140   }
    141   /* Copy the scheme to the storage */
    142   purl->scheme = malloc(sizeof(char) * (len + 1));
    143   if (NULL == purl->scheme) {
    144     parsed_url_free(purl);
    145     return NULL;
    146   }
    147   if (len > 0) {
    148     (void)strncpy(purl->scheme, curstr, len);
    149   }
    150   purl->scheme[len] = '\0';
    151   /* Make the character to lower if it is upper case. */
    152   for (i = 0; i < len; i++) {
    153     purl->scheme[i] = tolower(purl->scheme[i]);
    154   }
    155 
    156   /* Check if this is a path-based scheme */
    157   is_path = _is_path_scheme(purl->scheme);
    158 
    159   /* Skip ':' */
    160   tmpstr++;
    161   curstr = tmpstr;
    162 
    163   /*
    164    * Normalize: ensure we have // after scheme
    165    * If missing, treat everything as path
    166    */
    167   if ('/' != curstr[0] || '/' != curstr[1]) {
    168     /* No // - entire rest is path */
    169     tmpstr = curstr;
    170     while ('\0' != *tmpstr) {
    171       tmpstr++;
    172     }
    173     len = tmpstr - curstr;
    174     if (len > 0) {
    175       purl->path = malloc(sizeof(char) * (len + 1));
    176       if (NULL == purl->path) {
    177         parsed_url_free(purl);
    178         return NULL;
    179       }
    180       (void)strncpy(purl->path, curstr, len);
    181       purl->path[len] = '\0';
    182     }
    183     return purl;
    184   }
    185 
    186   /* Skip the "//" */
    187   curstr += 2;
    188 
    189   /*
    190    * Detect and consume username:password, consume @
    191    */
    192   tmpstr = curstr;
    193   while ('\0' != *tmpstr && '@' != *tmpstr) {
    194     tmpstr++;
    195   }
    196 
    197   if ('@' == *tmpstr) {
    198     /* Has userinfo */
    199     /* First check if there's a password (look for : before @) */
    200     const char *colon        = curstr;
    201     int         has_password = 0;
    202     while (colon < tmpstr) {
    203       if (':' == *colon) {
    204         has_password = 1;
    205         break;
    206       }
    207       colon++;
    208     }
    209 
    210     /* Read username */
    211     const char *username_start = curstr;
    212     if (has_password) {
    213       len = colon - curstr;
    214     } else {
    215       len = tmpstr - curstr;
    216     }
    217     if (len > 255) {
    218       SET_ERROR("ERR_USERNAME_TOO_LONG");
    219       parsed_url_free(purl);
    220       return NULL;
    221     }
    222     if (len > 0) {
    223       purl->username = malloc(sizeof(char) * (len + 1));
    224       if (NULL == purl->username) {
    225         parsed_url_free(purl);
    226         return NULL;
    227       }
    228       (void)strncpy(purl->username, username_start, len);
    229       purl->username[len] = '\0';
    230     }
    231 
    232     /* Skip to password or @ */
    233     if (has_password) {
    234       curstr = colon + 1;
    235       /* Read password */
    236       tmpstr = curstr;
    237       while ('\0' != *tmpstr && '@' != *tmpstr) {
    238         tmpstr++;
    239       }
    240       len = tmpstr - curstr;
    241       if (len > 255) {
    242         SET_ERROR("ERR_PASSWORD_TOO_LONG");
    243         parsed_url_free(purl);
    244         return NULL;
    245       }
    246       if (len > 0) {
    247         purl->password = malloc(sizeof(char) * (len + 1));
    248         if (NULL == purl->password) {
    249           parsed_url_free(purl);
    250           return NULL;
    251         }
    252         (void)strncpy(purl->password, curstr, len);
    253         purl->password[len] = '\0';
    254       }
    255       curstr = tmpstr;
    256     } else {
    257       /* No password - advance past username to @ */
    258       curstr = username_start + len;
    259     }
    260     /* Skip @ */
    261     while ('@' == *curstr) {
    262       curstr++;
    263     }
    264   }
    265 
    266   /*
    267    * If NOT a path scheme, detect and consume host:port
    268    */
    269   if (!is_path) {
    270     if ('[' == *curstr) {
    271       bracket_flag = 1;
    272       curstr++;
    273     } else {
    274       bracket_flag = 0;
    275     }
    276 
    277     /* Read host */
    278     tmpstr = curstr;
    279     while ('\0' != *tmpstr) {
    280       if (bracket_flag && ']' == *tmpstr) {
    281         break;
    282       } else if (!bracket_flag && (':' == *tmpstr || '/' == *tmpstr || '?' == *tmpstr || '#' == *tmpstr)) {
    283         break;
    284       }
    285       tmpstr++;
    286     }
    287     len = tmpstr - curstr;
    288     if (len > 255) {
    289       SET_ERROR("ERR_HOST_TOO_LONG");
    290       parsed_url_free(purl);
    291       return NULL;
    292     }
    293 
    294     if (len > 0) {
    295       purl->host = malloc(sizeof(char) * (len + 1));
    296       if (NULL == purl->host) {
    297         parsed_url_free(purl);
    298         return NULL;
    299       }
    300       (void)strncpy(purl->host, curstr, len);
    301       purl->host[len] = '\0';
    302     }
    303     curstr = tmpstr;
    304 
    305     /* Skip ']' if IPv6 */
    306     if (']' == *curstr) {
    307       curstr++;
    308     }
    309 
    310     /* Read port */
    311     if (':' == *curstr) {
    312       curstr++;
    313       tmpstr = curstr;
    314       while ('\0' != *tmpstr && '/' != *tmpstr && '?' != *tmpstr && '#' != *tmpstr) {
    315         tmpstr++;
    316       }
    317       len = tmpstr - curstr;
    318       if (len > 5) {
    319         SET_ERROR("ERR_PORT_TOO_LONG");
    320         parsed_url_free(purl);
    321         return NULL;
    322       }
    323       if (len > 0) {
    324         char   port_str[6];
    325         size_t copy_len = len < 5 ? len : 5;
    326         (void)strncpy(port_str, curstr, copy_len);
    327         port_str[copy_len] = '\0';
    328         long port_val      = strtol(port_str, NULL, 10);
    329         if (port_val < 1 || port_val > 65535) {
    330           SET_ERROR("ERR_PORT_INVALID");
    331           parsed_url_free(purl);
    332           return NULL;
    333         }
    334         purl->port = malloc(sizeof(char) * (len + 1));
    335         if (NULL == purl->port) {
    336           parsed_url_free(purl);
    337           return NULL;
    338         }
    339         (void)strncpy(purl->port, curstr, len);
    340         purl->port[len] = '\0';
    341       }
    342       curstr = tmpstr;
    343     }
    344   }
    345 
    346   /* End of string? */
    347   if ('\0' == *curstr) {
    348     return purl;
    349   }
    350 
    351   /* Parse path */
    352   tmpstr = curstr;
    353   while ('\0' != *tmpstr && '?' != *tmpstr && '#' != *tmpstr) {
    354     tmpstr++;
    355   }
    356   len = tmpstr - curstr;
    357   if (len > 4096) {
    358     SET_ERROR("ERR_PATH_TOO_LONG");
    359     parsed_url_free(purl);
    360     return NULL;
    361   }
    362   if (len > 0) {
    363     purl->path = malloc(sizeof(char) * (len + 1));
    364     if (NULL == purl->path) {
    365       parsed_url_free(purl);
    366       return NULL;
    367     }
    368     (void)strncpy(purl->path, curstr, len);
    369     purl->path[len] = '\0';
    370   }
    371   curstr = tmpstr;
    372 
    373   /* Parse query */
    374   if ('?' == *curstr) {
    375     curstr++;
    376     tmpstr = curstr;
    377     while ('\0' != *tmpstr && '#' != *tmpstr) {
    378       tmpstr++;
    379     }
    380     len = tmpstr - curstr;
    381     if (len > 4096) {
    382       SET_ERROR("ERR_QUERY_TOO_LONG");
    383       parsed_url_free(purl);
    384       return NULL;
    385     }
    386     if (len > 0) {
    387       purl->query = malloc(sizeof(char) * (len + 1));
    388       if (NULL == purl->query) {
    389         parsed_url_free(purl);
    390         return NULL;
    391       }
    392       (void)strncpy(purl->query, curstr, len);
    393       purl->query[len] = '\0';
    394     }
    395     curstr = tmpstr;
    396   }
    397 
    398   /* Parse fragment */
    399   if ('#' == *curstr) {
    400     curstr++;
    401     tmpstr = curstr;
    402     while ('\0' != *tmpstr) {
    403       tmpstr++;
    404     }
    405     len = tmpstr - curstr;
    406     if (len > 4096) {
    407       SET_ERROR("ERR_FRAGMENT_TOO_LONG");
    408       parsed_url_free(purl);
    409       return NULL;
    410     }
    411     if (len > 0) {
    412       purl->fragment = malloc(sizeof(char) * (len + 1));
    413       if (NULL == purl->fragment) {
    414         parsed_url_free(purl);
    415         return NULL;
    416       }
    417       (void)strncpy(purl->fragment, curstr, len);
    418       purl->fragment[len] = '\0';
    419     }
    420   }
    421 
    422   return purl;
    423 }
    424 
    425 /*
    426  * Free memory of parsed url
    427  */
    428 void parsed_url_free(struct parsed_url *purl) {
    429   if (NULL != purl) {
    430     if (NULL != purl->scheme) {
    431       free(purl->scheme);
    432     }
    433     if (NULL != purl->host) {
    434       free(purl->host);
    435     }
    436     if (NULL != purl->port) {
    437       free(purl->port);
    438     }
    439     if (NULL != purl->path) {
    440       free(purl->path);
    441     }
    442     if (NULL != purl->query) {
    443       free(purl->query);
    444     }
    445     if (NULL != purl->fragment) {
    446       free(purl->fragment);
    447     }
    448     if (NULL != purl->username) {
    449       free(purl->username);
    450     }
    451     if (NULL != purl->password) {
    452       free(purl->password);
    453     }
    454     free(purl);
    455   }
    456 }