/*
 *  Unbind a service with its scheduler
 */
int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
{
	struct ip_vs_scheduler *sched;

	if (svc == NULL) {
		IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
		return -EINVAL;
	}

	sched = svc->scheduler;
	if (sched == NULL) {
		IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
		return -EINVAL;
	}

	if (sched->done_service) {
		if (sched->done_service(svc) != 0) {
			IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
			return -EINVAL;
		}
	}

	svc->scheduler = NULL;
	return 0;
}
/*
 *  Bind a service with a scheduler
 */
int ip_vs_bind_scheduler(struct ip_vs_service *svc,
			 struct ip_vs_scheduler *scheduler)
{
	int ret;

	if (svc == NULL) {
		IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
		return -EINVAL;
	}
	if (scheduler == NULL) {
		IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
		return -EINVAL;
	}

	svc->scheduler = scheduler;

	if (scheduler->init_service) {
		ret = scheduler->init_service(svc);
		if (ret) {
			IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
			return ret;
		}
	}

	return 0;
}
/*
 *      Set up receiving multicast socket over UDP
 */
static struct socket * make_receive_sock(void)
{
	struct socket *sock;

	/* First create a socket */
	if (sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
		IP_VS_ERR("Error during creation of socket; terminating\n");
		return NULL;
	}

	/* it is equivalent to the REUSEADDR option in user-space */
	sock->sk->reuse = 1;

	if (sock->ops->bind(sock,
			    (struct sockaddr*)&mcast_addr,
			    sizeof(struct sockaddr)) < 0) {
		IP_VS_ERR("Error binding to the multicast addr\n");
		goto error;
	}

	/* join the multicast group */
	if (join_mcast_group(sock->sk,
			     (struct in_addr*)&mcast_addr.sin_addr,
			     ip_vs_mcast_ifn) < 0) {
		IP_VS_ERR("Error joining to the multicast group\n");
		goto error;
	}

	return sock;

error:
	sock_release(sock);
	return NULL;
}
/*
 *  Unregister a scheduler from the scheduler list
 */
int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
{
	if (!scheduler) {
		IP_VS_ERR("unregister_ip_vs_scheduler(): NULL arg\n");
		return -EINVAL;
	}

	write_lock_bh(&__ip_vs_sched_lock);
	if (scheduler->n_list.next == &scheduler->n_list) {
		write_unlock_bh(&__ip_vs_sched_lock);
		IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
			  "is not in the list. failed\n", scheduler->name);
		return -EINVAL;
	}

	/*
	 *	Remove it from the d-linked scheduler list
	 */
	list_del(&scheduler->n_list);
	write_unlock_bh(&__ip_vs_sched_lock);

	MOD_DEC_USE_COUNT;

	IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);

	return 0;
}
/*
 *      Process received multicast message and create the corresponding
 *      ip_vs_conn entries.
 */
static void ip_vs_process_message(const char *buffer, const size_t buflen)
{
	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
	struct ip_vs_sync_conn *s;
	struct ip_vs_sync_conn_options *opt;
	struct ip_vs_conn *cp;
	char *p;
	int i;

	if (buflen != m->size) {
		IP_VS_ERR("bogus message\n");
		return;
	}

	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
	for (i=0; i<m->nr_conns; i++) {
		s = (struct ip_vs_sync_conn *)p;
		cp = ip_vs_conn_in_get(s->protocol,
				       s->caddr, s->cport,
				       s->vaddr, s->vport);
		if (!cp) {
			cp = ip_vs_conn_new(s->protocol,
					    s->caddr, s->cport,
					    s->vaddr, s->vport,
					    s->daddr, s->dport,
					    ntohs(s->flags), NULL);
			if (!cp) {
				IP_VS_ERR("ip_vs_conn_new failed\n");
				return;
			}
			cp->state = ntohs(s->state);
		} else if (!cp->dest) {
			/* it is an entry created by the synchronization */
			cp->state = ntohs(s->state);
			cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
		}
		/* Note that we don't touch its state and flags
		   if it is a normal entry. */

		if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
			opt = (struct ip_vs_sync_conn_options *)&s[1];
			memcpy(&cp->in_seq, opt, sizeof(*opt));
			p += FULL_CONN_SIZE;
		} else
			p += SIMPLE_CONN_SIZE;

		atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold);
		cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
		ip_vs_conn_put(cp);

		if (p > buffer+buflen) {
			IP_VS_ERR("bogus message\n");
			return;
		}
	}
}
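/*
 * For reference, a sketch of the on-wire layout the parser above walks.
 * This is illustrative only (the "_sketch" name and exact field order
 * are assumptions; the authoritative definitions live in the IPVS sync
 * header): a small message header followed by nr_conns connection
 * entries, each SIMPLE_CONN_SIZE bytes, or FULL_CONN_SIZE bytes when
 * IP_VS_CONN_F_SEQ_MASK flags sequence-number options after the entry.
 */
struct ip_vs_sync_mesg_sketch {
	__u8	nr_conns;	/* number of connection entries that follow */
	__u8	syncid;		/* id of the sync daemon instance */
	__u16	size;		/* total size, validated against buflen above */
	/* struct ip_vs_sync_conn entries start here */
};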
static void sync_master_loop(void)
{
	struct socket *sock;
	struct ip_vs_sync_buff *sb;
	struct ip_vs_sync_mesg *m;

	/* create the sending multicast socket */
	sock = make_send_sock();
	if (!sock)
		return;

	for (;;) {
		while ((sb=sb_dequeue())) {
			m = sb->mesg;
			if (ip_vs_send_async(sock, (char *)m,
					     m->size) != m->size)
				IP_VS_ERR("ip_vs_send_async error\n");
			ip_vs_sync_buff_release(sb);
		}

		/* check if entries stay in curr_sb for 2 seconds */
		if ((sb = get_curr_sync_buff(2*HZ))) {
			m = sb->mesg;
			if (ip_vs_send_async(sock, (char *)m,
					     m->size) != m->size)
				IP_VS_ERR("ip_vs_send_async error\n");
			ip_vs_sync_buff_release(sb);
		}

		if (stop_sync)
			break;

		__set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);
		__set_current_state(TASK_RUNNING);
	}

	/* clean up the sync_buff queue */
	while ((sb=sb_dequeue())) {
		ip_vs_sync_buff_release(sb);
	}

	/* clean up the current sync_buff */
	if ((sb = get_curr_sync_buff(0))) {
		ip_vs_sync_buff_release(sb);
	}

	/* release the sending multicast socket */
	sock_release(sock);
}
/*
 *  Register a scheduler in the scheduler list
 */
int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
{
	struct ip_vs_scheduler *sched;

	if (!scheduler) {
		IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
		return -EINVAL;
	}

	if (!scheduler->name) {
		IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
		return -EINVAL;
	}

	MOD_INC_USE_COUNT;

	/*
	 *  Make sure that the scheduler with this name doesn't exist
	 *  in the scheduler list.
	 */
	sched = ip_vs_sched_getbyname(scheduler->name);
	if (sched) {
		ip_vs_scheduler_put(sched);
		MOD_DEC_USE_COUNT;
		IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
			  "already existed in the system\n", scheduler->name);
		return -EINVAL;
	}

	write_lock_bh(&__ip_vs_sched_lock);

	if (scheduler->n_list.next != &scheduler->n_list) {
		write_unlock_bh(&__ip_vs_sched_lock);
		MOD_DEC_USE_COUNT;
		IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
			  "already linked\n", scheduler->name);
		return -EINVAL;
	}

	/*
	 *	Add it into the d-linked scheduler list
	 */
	list_add(&scheduler->n_list, &ip_vs_schedulers);
	write_unlock_bh(&__ip_vs_sched_lock);

	IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);

	return 0;
}
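/*
 * A minimal usage sketch (not from the IPVS tree) of how a scheduler
 * module registers itself with the API above.  Everything named
 * "dummy" here is hypothetical; the schedule() signature and the
 * ip_vs_scheduler field layout vary by kernel version, so this is an
 * illustration of the registration protocol, not a drop-in module.
 * Note that n_list must point at itself before registering, because
 * register_ip_vs_scheduler() treats any other value as "already linked".
 */
static struct ip_vs_dest *
ip_vs_dummy_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
	/* no real balancing: pick the first destination, if any */
	if (list_empty(&svc->destinations))
		return NULL;
	return list_entry(svc->destinations.next, struct ip_vs_dest, n_list);
}

static struct ip_vs_scheduler ip_vs_dummy_scheduler = {
	.name		= "dummy",
	.schedule	= ip_vs_dummy_schedule,
};

static int __init ip_vs_dummy_init(void)
{
	INIT_LIST_HEAD(&ip_vs_dummy_scheduler.n_list);
	return register_ip_vs_scheduler(&ip_vs_dummy_scheduler);
}

static void __exit ip_vs_dummy_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_dummy_scheduler);
}

module_init(ip_vs_dummy_init);
module_exit(ip_vs_dummy_cleanup);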
static struct ip_vs_dest_list *
ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
	struct ip_vs_dest_list *e;

	for (e=set->list; e!=NULL; e=e->next) {
		if (e->dest == dest)
			/* already existed */
			return NULL;
	}

	e = kmalloc(sizeof(*e), GFP_ATOMIC);
	if (e == NULL) {
		IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
		return NULL;
	}

	atomic_inc(&dest->refcnt);
	e->dest = dest;

	/* link it to the list */
	e->next = set->list;
	set->list = e;
	atomic_inc(&set->size);

	set->lastmod = jiffies;
	return e;
}
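/*
 * A plausible counterpart to the insert above (a sketch, not the
 * verbatim IPVS routine): unlink a destination from the set, dropping
 * the reference taken at insert time and updating size/lastmod the
 * same way the insert does.
 */
static void
ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
	struct ip_vs_dest_list *e, **ep;

	for (ep = &set->list, e = *ep; e != NULL; e = *ep) {
		if (e->dest == dest) {
			/* unlink, then release the refcnt from insert */
			*ep = e->next;
			atomic_dec(&set->size);
			set->lastmod = jiffies;
			atomic_dec(&e->dest->refcnt);
			kfree(e);
			break;
		}
		ep = &e->next;
	}
}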
int start_sync_thread(int state, char *mcast_ifn)
{
	DECLARE_COMPLETION(startup);
	pid_t pid;

	if (sync_pid)
		return -EEXIST;

	IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %d bytes\n",
		  sizeof(struct ip_vs_sync_conn));

	ip_vs_sync_state = state;
	strcpy(ip_vs_mcast_ifn, mcast_ifn);

  repeat:
	if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
		IP_VS_ERR("could not create fork_sync_thread due to %d... "
			  "retrying.\n", pid);
		current->state = TASK_UNINTERRUPTIBLE;
		schedule_timeout(HZ);
		goto repeat;
	}

	wait_for_completion(&startup);

	return 0;
}
/*
 *  IPVS main scheduling function
 *  It selects a server according to the virtual service, and
 *  creates a connection entry.
 *  Protocols supported: TCP, UDP
 */
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_conn *cp = NULL;
	struct iphdr *iph = ip_hdr(skb);
	struct ip_vs_dest *dest;
	__be16 _ports[2], *pptr;

	pptr = skb_header_pointer(skb, iph->ihl*4,
				  sizeof(_ports), _ports);
	if (pptr == NULL)
		return NULL;

	/*
	 *    Persistent service
	 */
	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
		return ip_vs_sched_persist(svc, skb, pptr);

	/*
	 *    Non-persistent service
	 */
	if (!svc->fwmark && pptr[1] != svc->port) {
		if (!svc->port)
			IP_VS_ERR("Schedule: port zero only supported "
				  "in persistent services, "
				  "check your ipvs configuration\n");
		return NULL;
	}

	dest = svc->scheduler->schedule(svc, skb);
	if (dest == NULL) {
		IP_VS_DBG(1, "Schedule: no dest found.\n");
		return NULL;
	}

	/*
	 *    Create a connection entry.
	 */
	cp = ip_vs_conn_new(iph->protocol,
			    iph->saddr, pptr[0],
			    iph->daddr, pptr[1],
			    dest->addr, dest->port ? dest->port : pptr[1],
			    0, dest);
	if (cp == NULL)
		return NULL;

	IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
		  "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
		  ip_vs_fwd_tag(cp),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  cp->flags, atomic_read(&cp->refcnt));

	ip_vs_conn_stats(cp, svc);
	return cp;
}
/*
 *  IPVS main scheduling function
 *  It selects a server according to the virtual service, and
 *  creates a connection entry.
 */
static struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
	struct ip_vs_conn *cp = NULL;
	struct ip_vs_dest *dest;
	const __u16 *portp;

	/*
	 *    Persistent service
	 */
	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
		return ip_vs_sched_persist(svc, iph);

	/*
	 *    Non-persistent service
	 */
	portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
	if (!svc->fwmark && portp[1] != svc->port) {
		if (!svc->port)
			IP_VS_ERR("Schedule: port zero only supported "
				  "in persistent services, "
				  "check your ipvs configuration\n");
		return NULL;
	}

	dest = svc->scheduler->schedule(svc, iph);
	if (dest == NULL) {
		IP_VS_DBG(1, "Schedule: no dest found.\n");
		return NULL;
	}

	/*
	 *    Create a connection entry.
	 */
	cp = ip_vs_conn_new(iph->protocol,
			    iph->saddr, portp[0],
			    iph->daddr, portp[1],
			    dest->addr, dest->port ? dest->port : portp[1],
			    0, dest);
	if (cp == NULL)
		return NULL;

	/*
	 *    Increase the inactive connection counter because it is in
	 *    Syn-Received state (inactive) when the connection is created.
	 */
	atomic_inc(&dest->inactconns);

	IP_VS_DBG(6, "Schedule fwd:%c s:%s c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
		  "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
		  NIPQUAD(cp->caddr), ntohs(cp->cport),
		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
		  NIPQUAD(cp->daddr), ntohs(cp->dport),
		  cp->flags, atomic_read(&cp->refcnt));

	return cp;
}
static void sync_backup_loop(void)
{
	struct socket *sock;
	char *buf;
	int len;

	if (!(buf = kmalloc(SYNC_MESG_MAX_SIZE, GFP_ATOMIC))) {
		IP_VS_ERR("sync_backup_loop: kmalloc error\n");
		return;
	}

	/* create the receiving multicast socket */
	sock = make_receive_sock();
	if (!sock)
		goto out;

	for (;;) {
		/* process any data that has already arrived */
		while (!skb_queue_empty(&(sock->sk->receive_queue))) {
			if ((len = ip_vs_receive(sock, buf,
						 SYNC_MESG_MAX_SIZE)) <= 0) {
				IP_VS_ERR("receiving message error\n");
				break;
			}
			/* disable bottom halves, because the message
			   processing accesses data shared with softirq
			   while getting/creating connections */
			local_bh_disable();
			ip_vs_process_message(buf, len);
			local_bh_enable();
		}

		if (stop_sync)
			break;

		__set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);
		__set_current_state(TASK_RUNNING);
	}

	/* release the receiving multicast socket */
	sock_release(sock);

  out:
	kfree(buf);
}
/*
 *      Add an ip_vs_conn information into the current sync_buff.
 *      Called by ip_vs_in.
 */
void ip_vs_sync_conn(struct ip_vs_conn *cp)
{
	struct ip_vs_sync_mesg *m;
	struct ip_vs_sync_conn *s;
	int len;

	spin_lock(&curr_sb_lock);
	if (!curr_sb) {
		if (!(curr_sb = ip_vs_sync_buff_create())) {
			spin_unlock(&curr_sb_lock);
			IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
			return;
		}
	}

	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
		SIMPLE_CONN_SIZE;
	m = curr_sb->mesg;
	s = (struct ip_vs_sync_conn *)curr_sb->head;

	/* copy members */
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr;
	s->vaddr = cp->vaddr;
	s->daddr = cp->daddr;
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		memcpy(opt, &cp->in_seq, sizeof(*opt));
	}

	m->nr_conns++;
	m->size += len;
	curr_sb->head += len;

	/* check if there is space for the next entry */
	if (curr_sb->head + FULL_CONN_SIZE > curr_sb->end) {
		sb_queue_tail(curr_sb);
		curr_sb = NULL;
	}
	spin_unlock(&curr_sb_lock);

	/* synchronize its controller if it has one */
	if (cp->control)
		ip_vs_sync_conn(cp->control);
}
/*
 *      Set up sending multicast socket over UDP
 */
static struct socket * make_send_sock(void)
{
	struct socket *sock;

	/* First create a socket */
	if (sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
		IP_VS_ERR("Error during creation of socket; terminating\n");
		return NULL;
	}

	if (set_mcast_if(sock->sk, ip_vs_mcast_ifn) < 0) {
		IP_VS_ERR("Error setting outbound mcast interface\n");
		goto error;
	}

	set_mcast_loop(sock->sk, 0);
	set_mcast_ttl(sock->sk, 1);

	if (bind_mcastif_addr(sock, ip_vs_mcast_ifn) < 0) {
		IP_VS_ERR("Error binding address of the mcast interface\n");
		goto error;
	}

	if (sock->ops->connect(sock,
			       (struct sockaddr*)&mcast_addr,
			       sizeof(struct sockaddr), 0) < 0) {
		IP_VS_ERR("Error connecting to the multicast addr\n");
		goto error;
	}

	return sock;

error:
	sock_release(sock);
	return NULL;
}
static int fork_sync_thread(void *startup)
{
	pid_t pid;

	/* fork the sync thread here, so that the sync thread is
	   reparented to the init process once this thread exits */
  repeat:
	if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
		IP_VS_ERR("could not create sync_thread due to %d... "
			  "retrying.\n", pid);
		current->state = TASK_UNINTERRUPTIBLE;
		schedule_timeout(HZ);
		goto repeat;
	}

	return 0;
}
/*
 *	Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
 *	returns bool success.
 */
static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
			     struct ip_vs_lblc_entry *en)
{
	if (list_empty(&en->list)) {
		IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	/*
	 *	Remove it from the table
	 */
	write_lock(&tbl->lock);
	list_del(&en->list);
	INIT_LIST_HEAD(&en->list);
	write_unlock(&tbl->lock);

	return 1;
}
static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblcr_table *tbl;

	/*
	 *    Allocate the ip_vs_lblcr_table for this service
	 */
	tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
	if (tbl == NULL) {
		IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLCR hash table (memory=%dbytes) allocated for "
		  "current service\n",
		  sizeof(struct ip_vs_lblcr_table));

	/*
	 *    Initialize the hash buckets
	 */
	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
		INIT_LIST_HEAD(&tbl->bucket[i]);
	}
	tbl->lock = RW_LOCK_UNLOCKED;
	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 *    Hook periodic timer for garbage collection
	 */
	init_timer(&tbl->periodic_timer);
	tbl->periodic_timer.data = (unsigned long)tbl;
	tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
	add_timer(&tbl->periodic_timer);

#ifdef CONFIG_IP_VS_LBLCR_DEBUG
	lblcr_table_list = tbl;
#endif
	return 0;
}
/*
 *      new/free a ip_vs_lblc_entry, which is a mapping of a destination
 *      IP address to a server.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
{
	struct ip_vs_lblc_entry *en;

	en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
	if (en == NULL) {
		IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
		return NULL;
	}

	INIT_LIST_HEAD(&en->list);
	en->addr = daddr;

	atomic_inc(&dest->refcnt);
	en->dest = dest;

	return en;
}
/*
 *      new/free a ip_vs_lblcr_entry, which is a mapping of a destination
 *      IP address to a server.
 */
static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
{
	struct ip_vs_lblcr_entry *en;

	en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
	if (en == NULL) {
		IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
		return NULL;
	}

	INIT_LIST_HEAD(&en->list);
	en->addr = daddr;

	/* initialize its dest set */
	atomic_set(&(en->set.size), 0);
	en->set.list = NULL;
	en->set.lock = RW_LOCK_UNLOCKED;

	return en;
}
static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
{
	struct ip_vs_wrr_mark *mark;

	/*
	 *    Allocate the mark variable for WRR scheduling
	 */
	mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
	if (mark == NULL) {
		IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
		return -ENOMEM;
	}
	mark->cl = &svc->destinations;
	mark->cw = 0;
	mark->mw = ip_vs_wrr_max_weight(svc);
	mark->di = ip_vs_wrr_gcd_weight(svc);
	svc->sched_data = mark;

	return 0;
}
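/*
 * Sketches of the weight helpers used above (hedged: the real bodies
 * live in ip_vs_wrr.c, and the exact weight accessor differs by kernel
 * version).  WRR repeatedly decrements a current weight by the gcd of
 * all destination weights, so every weight is visited exactly in
 * proportion; that is why the gcd is precomputed into mark->di.
 */
static int gcd(int a, int b)
{
	int c;

	while ((c = a % b)) {
		a = b;
		b = c;
	}
	return b;
}

static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
{
	struct list_head *l, *e;
	struct ip_vs_dest *dest;
	int weight;
	int g = 0;

	l = &svc->destinations;
	for (e = l->next; e != l; e = e->next) {
		dest = list_entry(e, struct ip_vs_dest, n_list);
		weight = dest->weight;
		if (weight > 0) {
			if (g > 0)
				g = gcd(weight, g);
			else
				g = weight;
		}
	}
	return g ? g : 1;
}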
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
	struct ip_vs_sh_bucket *tbl;

	/* allocate the SH table for this service */
	tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
		      GFP_ATOMIC);
	if (tbl == NULL) {
		IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "SH hash table (memory=%dbytes) allocated for "
		  "current service\n",
		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);

	/* assign the hash buckets with the updated service */
	ip_vs_sh_assign(tbl, svc);

	return 0;
}
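/*
 * A sketch of the bucket assignment called above (hedged; the real
 * routine is in ip_vs_sh.c).  It fills the IP_VS_SH_TAB_SIZE buckets
 * by cycling through the service's destinations, so each bucket maps
 * a source-address hash value to one server.
 */
static int
ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_sh_bucket *b;
	struct list_head *p;
	struct ip_vs_dest *dest;

	b = tbl;
	p = &svc->destinations;
	for (i = 0; i < IP_VS_SH_TAB_SIZE; i++) {
		if (list_empty(p)) {
			b->dest = NULL;
		} else {
			if (p == &svc->destinations)
				p = p->next;	/* skip the list head */

			dest = list_entry(p, struct ip_vs_dest, n_list);
			atomic_inc(&dest->refcnt);
			b->dest = dest;

			p = p->next;
		}
		b++;
	}
	return 0;
}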
/*
 *	Hash an entry in the ip_vs_lblc_table.
 *	returns bool success.
 */
static int
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
	unsigned hash;

	if (!list_empty(&en->list)) {
		IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	/*
	 *	Hash by destination IP address
	 */
	hash = ip_vs_lblc_hashkey(en->addr);

	write_lock(&tbl->lock);
	list_add(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
	write_unlock(&tbl->lock);

	return 1;
}
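/*
 * A sketch of the hash key computation used above (hedged; the exact
 * constant and mask are defined in ip_vs_lblc.c, with
 * IP_VS_LBLC_TAB_MASK assumed to be IP_VS_LBLC_TAB_SIZE - 1).  It is a
 * multiplicative hash of the destination IP folded into the table size.
 */
static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
{
	/* 2654435761 is the golden-ratio multiplier for 32-bit keys */
	return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
}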
static int bind_mcastif_addr(struct socket *sock, char *ifname)
{
	struct net_device *dev;
	u32 addr;
	struct sockaddr_in sin;

	if ((dev = __dev_get_by_name(ifname)) == NULL)
		return -ENODEV;

	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	if (!addr)
		IP_VS_ERR("You probably need to specify IP address on "
			  "multicast interface.\n");

	IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
		  ifname, NIPQUAD(addr));

	/* Now bind the socket with the address of multicast interface */
	sin.sin_family	    = AF_INET;
	sin.sin_addr.s_addr = addr;
	sin.sin_port        = 0;

	return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
}
/*
 *	Initialize IP Virtual Server
 */
static int __init ip_vs_init(void)
{
	int ret;

	ret = ip_vs_control_init();
	if (ret < 0) {
		IP_VS_ERR("can't setup control.\n");
		goto cleanup_nothing;
	}

	ip_vs_protocol_init();

	ret = ip_vs_app_init();
	if (ret < 0) {
		IP_VS_ERR("can't setup application helper.\n");
		goto cleanup_protocol;
	}

	ret = ip_vs_conn_init();
	if (ret < 0) {
		IP_VS_ERR("can't setup connection table.\n");
		goto cleanup_app;
	}

	ret = nf_register_hook(&ip_vs_in_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register in hook.\n");
		goto cleanup_conn;
	}

	ret = nf_register_hook(&ip_vs_out_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register out hook.\n");
		goto cleanup_inops;
	}

	ret = nf_register_hook(&ip_vs_post_routing_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register post_routing hook.\n");
		goto cleanup_outops;
	}

	ret = nf_register_hook(&ip_vs_forward_icmp_ops);
	if (ret < 0) {
		IP_VS_ERR("can't register forward_icmp hook.\n");
		goto cleanup_postroutingops;
	}

	IP_VS_INFO("ipvs loaded.\n");
	return ret;

  cleanup_postroutingops:
	nf_unregister_hook(&ip_vs_post_routing_ops);
  cleanup_outops:
	nf_unregister_hook(&ip_vs_out_ops);
  cleanup_inops:
	nf_unregister_hook(&ip_vs_in_ops);
  cleanup_conn:
	ip_vs_conn_cleanup();
  cleanup_app:
	ip_vs_app_cleanup();
  cleanup_protocol:
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();
  cleanup_nothing:
	return ret;
}
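/*
 * A sketch of the matching module exit path (hedged: the real
 * ip_vs_cleanup() is defined elsewhere in the tree).  It mirrors the
 * error-unwinding labels above, tearing everything down in reverse
 * order of registration.
 */
static void __exit ip_vs_cleanup(void)
{
	nf_unregister_hook(&ip_vs_forward_icmp_ops);
	nf_unregister_hook(&ip_vs_post_routing_ops);
	nf_unregister_hook(&ip_vs_out_ops);
	nf_unregister_hook(&ip_vs_in_ops);
	ip_vs_conn_cleanup();
	ip_vs_app_cleanup();
	ip_vs_protocol_cleanup();
	ip_vs_control_cleanup();
	IP_VS_INFO("ipvs unloaded.\n");
}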
/*
 *	Handle ICMP messages in the inside-to-outside direction (outgoing).
 *	Find any that might be relevant, check against existing connections,
 *	forward to the right destination host if relevant.
 *	Currently handles error types - unreachable, quench, ttl exceeded.
 *	(Only used in VS/NAT)
 */
static int ip_vs_out_icmp(struct sk_buff **skb_p)
{
	struct sk_buff	*skb = *skb_p;
	struct iphdr	*iph;
	struct icmphdr	*icmph;
	struct iphdr	*ciph;	/* The ip header contained within the ICMP */
	__u16		*pptr;	/* port numbers from TCP/UDP contained header */
	unsigned short	ihl;
	unsigned short	len;
	unsigned short	clen, csize;
	struct ip_vs_conn *cp;

	/* reassemble IP fragments, but will it happen in ICMP packets?? */
	if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
		skb = ip_defrag(skb, IP_DEFRAG_VS_OUT);
		if (!skb)
			return NF_STOLEN;
		*skb_p = skb;
	}

	if (skb_is_nonlinear(skb)) {
		if (skb_linearize(skb, GFP_ATOMIC) != 0)
			return NF_DROP;
		ip_send_check(skb->nh.iph);
	}

	iph = skb->nh.iph;
	ihl = iph->ihl << 2;
	icmph = (struct icmphdr *)((char *)iph + ihl);
	len = ntohs(iph->tot_len) - ihl;
	if (len < sizeof(struct icmphdr))
		return NF_DROP;

	IP_VS_DBG(12, "outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
		  icmph->type, ntohs(icmp_id(icmph)),
		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if ((icmph->type != ICMP_DEST_UNREACH) &&
	    (icmph->type != ICMP_SOURCE_QUENCH) &&
	    (icmph->type != ICMP_TIME_EXCEEDED))
		return NF_ACCEPT;

	/* Now find the contained IP header */
	clen = len - sizeof(struct icmphdr);
	if (clen < sizeof(struct iphdr))
		return NF_DROP;
	ciph = (struct iphdr *) (icmph + 1);
	csize = ciph->ihl << 2;
	if (clen < csize)
		return NF_DROP;

	/* We are only interested in ICMPs generated from TCP or UDP packets */
	if (ciph->protocol != IPPROTO_UDP && ciph->protocol != IPPROTO_TCP)
		return NF_ACCEPT;

	/* Skip non-first embedded TCP/UDP fragments */
	if (ciph->frag_off & __constant_htons(IP_OFFSET))
		return NF_ACCEPT;

	/* We need at least TCP/UDP ports here */
	if (clen < csize + sizeof(struct udphdr))
		return NF_DROP;

	/*
	 * Find the ports involved - this packet was
	 * incoming so the ports are right way round
	 * (but reversed relative to outer IP header!)
	 */
	pptr = (__u16 *)&(((char *)ciph)[csize]);

	/* Ensure the checksum is correct */
	if (ip_compute_csum((unsigned char *) icmph, len)) {
		/* Failed checksum! */
		IP_VS_DBG(1, "forward ICMP: failed checksum from %d.%d.%d.%d!\n",
			  NIPQUAD(iph->saddr));
		return NF_DROP;
	}

	IP_VS_DBG(11, "Handling outgoing ICMP for "
		  "%u.%u.%u.%u:%d -> %u.%u.%u.%u:%d\n",
		  NIPQUAD(ciph->saddr), ntohs(pptr[0]),
		  NIPQUAD(ciph->daddr), ntohs(pptr[1]));

	/* ciph content is actually <protocol, caddr, cport, daddr, dport> */
	cp = ip_vs_conn_out_get(ciph->protocol, ciph->daddr, pptr[1],
				ciph->saddr, pptr[0]);
	if (!cp)
		return NF_ACCEPT;

	if (IP_VS_FWD_METHOD(cp) != 0) {
		IP_VS_ERR("shouldn't reach here, because the box is on the "
			  "half connection in the tun/dr module.\n");
	}

	/* Now we do real damage to this packet...! */

	/* First change the source IP address, and recalc checksum */
	iph->saddr = cp->vaddr;
	ip_send_check(iph);

	/* Now change the *dest* address in the contained IP */
	ciph->daddr = cp->vaddr;
	ip_send_check(ciph);

	/* the TCP/UDP dest port - cannot redo check */
	pptr[1] = cp->vport;

	/* And finally the ICMP checksum */
	icmph->checksum = 0;
	icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
	skb->ip_summed = CHECKSUM_UNNECESSARY;

	/* do the statistics and put it back */
	ip_vs_out_stats(cp, skb);
	ip_vs_conn_put(cp);

	IP_VS_DBG(11, "Forwarding correct outgoing ICMP to "
		  "%u.%u.%u.%u:%d -> %u.%u.%u.%u:%d\n",
		  NIPQUAD(ciph->saddr), ntohs(pptr[0]),
		  NIPQUAD(ciph->daddr), ntohs(pptr[1]));

	skb->nfcache |= NFC_IPVS_PROPERTY;

	return NF_ACCEPT;
}
/*
 *	Handle ICMP messages in the inside-to-outside direction (outgoing).
 *	Find any that might be relevant, check against existing connections,
 *	forward to the right destination host if relevant.
 *	Currently handles error types - unreachable, quench, ttl exceeded.
 *	(Only used in VS/NAT)
 */
static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
{
	struct sk_buff *skb = *pskb;
	struct iphdr *iph;
	struct icmphdr	_icmph, *ic;
	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	unsigned int offset, ihl, verdict;

	*related = 1;

	/* reassemble IP fragments */
	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
		skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
		if (!skb)
			return NF_STOLEN;
		*pskb = skb;
	}

	iph = ip_hdr(skb);
	offset = ihl = iph->ihl * 4;
	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
	if (ic == NULL)
		return NF_DROP;

	IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
		  ic->type, ntohs(icmp_id(ic)),
		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if ((ic->type != ICMP_DEST_UNREACH) &&
	    (ic->type != ICMP_SOURCE_QUENCH) &&
	    (ic->type != ICMP_TIME_EXCEEDED)) {
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	offset += sizeof(_icmph);
	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
	if (cih == NULL)
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	pp = ip_vs_proto_get(cih->protocol);
	if (!pp)
		return NF_ACCEPT;

	/* Is the embedded protocol header present? */
	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
		     pp->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");

	offset += cih->ihl * 4;

	/* The embedded headers contain source and dest in reverse order */
	cp = pp->conn_out_get(skb, pp, cih, offset, 1);
	if (!cp)
		return NF_ACCEPT;

	verdict = NF_DROP;

	if (IP_VS_FWD_METHOD(cp) != 0) {
		IP_VS_ERR("shouldn't reach here, because the box is on the "
			  "half connection in the tun/dr module.\n");
	}

	/* Ensure the checksum is correct */
	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
		/* Failed checksum! */
		IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
			  NIPQUAD(iph->saddr));
		goto out;
	}

	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
		offset += 2 * sizeof(__u16);
	if (!ip_vs_make_skb_writable(pskb, offset))
		goto out;
	skb = *pskb;

	ip_vs_nat_icmp(skb, pp, cp, 1);

	/* do the statistics and put it back */
	ip_vs_out_stats(cp, skb);

	skb->ipvs_property = 1;
	verdict = NF_ACCEPT;

  out:
	__ip_vs_conn_put(cp);

	return verdict;
}