/* Initializes the vma tracker. */
static int _stp_vma_init(void)
{
  int rc = 0;
#if defined(CONFIG_UTRACE)
  static struct stap_task_finder_target vmcb = {
    // NB: no .pid, no .procname filters here.
    // This means that we get a system-wide mmap monitoring
    // widget while the script is running. (The
    // system-wideness may be restricted by stap -c or
    // -x.) But this seems to be necessary if we want
    // to stack tracebacks through arbitrary shared libraries.
    //
    // XXX: There may be an optimization opportunity
    // for executables (for which the main task-finder
    // callback should be sufficient).
    .pid = 0,
    .procname = NULL,
    .callback = &_stp_vma_exec_cb,
    .mmap_callback = &_stp_vma_mmap_cb,
    .munmap_callback = &_stp_vma_munmap_cb,
    .mprotect_callback = NULL
  };
  stap_initialize_vma_map ();
#ifdef DEBUG_TASK_FINDER_VMA
  _stp_dbug(__FUNCTION__, __LINE__,
            "registering vmcb (_stp_target: %d)\n", _stp_target);
#endif
  rc = stap_register_task_finder_target (& vmcb);
  if (rc != 0)
    _stp_error("Couldn't register task finder target: %d\n", rc);
#endif
  return rc;
}
static int _stp_build_id_check (struct _stp_module *m,
                                unsigned long notes_addr,
                                struct task_struct *tsk)
{
  int j;

  for (j = 0; j < m->build_id_len; j++) {
    /* Use set_fs / get_user to access conceivably invalid addresses.
     * If loc2c-runtime.h were more easily usable, a deref() loop
     * could do it too. */
    mm_segment_t oldfs = get_fs();
    int rc;
    unsigned char theory, practice = 0;

#ifdef STAPCONF_PROBE_KERNEL
    if (!tsk) {
      theory = m->build_id_bits[j];
      set_fs(KERNEL_DS);
      rc = probe_kernel_read(&practice, (void*)(notes_addr + j), 1);
    }
    else
#endif
    {
      theory = m->build_id_bits[j];
      set_fs (tsk ? USER_DS : KERNEL_DS);

      /*
       * Why check CONFIG_UTRACE here? If we're using real in-kernel
       * utrace, we can always just call get_user() (since we're
       * either reading kernel memory or tsk == current).
       *
       * Since we're only reading here, we can call
       * __access_process_vm_noflush(), which only calls things that
       * are exported.
       */
#ifdef CONFIG_UTRACE
      rc = get_user(practice, ((unsigned char*)(void*)(notes_addr + j)));
#else
      if (!tsk || tsk == current) {
        rc = get_user(practice, ((unsigned char*)(void*)(notes_addr + j)));
      }
      else {
        rc = (__access_process_vm_noflush(tsk, (notes_addr + j),
                                          &practice, 1, 0) != 1);
      }
#endif
    }
    set_fs(oldfs);

    if (rc || (theory != practice)) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
      _stp_error ("Build-id mismatch [man error::buildid]: \"%s\" byte %d"
                  " (0x%02x vs 0x%02x) address %#lx rc %d\n",
                  m->path, j, theory, practice, notes_addr, rc);
      return 1;
#else
      /* This branch is a surrogate for kernels affected by Fedora bug
       * #465873. */
      _stp_warn (KERN_WARNING
                 "Build-id mismatch [man error::buildid]: \"%s\" byte %d"
                 " (0x%02x vs 0x%02x) rc %d\n",
                 m->path, j, theory, practice, rc);
#endif
      break;
    } /* end mismatch */
  } /* end per-byte check loop */
  return 0;
}
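The set_fs()/get_user() dance above is the pre-5.10 way to probe a conceivably invalid address without oopsing. Below is a minimal standalone sketch of that single-byte read (not the runtime code itself), assuming a kernel old enough to still export set_fs(); kernels 5.10+ dropped it in favor of copy_from_kernel_nofault()/copy_from_user_nofault().

/* Sketch: read one byte from a possibly invalid kernel or user address.
 * Assumes a pre-5.10 kernel where set_fs()/get_fs() still exist. */
#include <linux/uaccess.h>
#include <linux/sched.h>

static int read_one_byte(unsigned long addr, int user, unsigned char *out)
{
  mm_segment_t oldfs = get_fs();
  int rc;

  /* Widen the address-space limit so get_user() accepts kernel
   * addresses when 'user' is false. */
  set_fs(user ? USER_DS : KERNEL_DS);
  rc = get_user(*out, (unsigned char __user *)addr);
  set_fs(oldfs);
  return rc; /* 0 on success, -EFAULT if the address faulted */
}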
static int stap_uprobe_change_plus (struct task_struct *tsk,
                                    unsigned long relocation,
                                    unsigned long length,
                                    const struct stap_uprobe_tf *stf,
                                    unsigned long offset,
                                    unsigned long vm_flags)
{
  int tfi = (stf - stap_uprobe_finders);
  int spec_index;
  /* iterate over stap_uprobe_spec[] that use this same stap_uprobe_tf */
  for (spec_index = 0;
       spec_index < sizeof(stap_uprobe_specs)/sizeof(stap_uprobe_specs[0]);
       spec_index++) {
    int handled_p = 0;
    int slotted_p = 0;
    const struct stap_uprobe_spec *sups = &stap_uprobe_specs [spec_index];
    struct stap_uprobe *sup;
    pid_t sdt_sem_pid;
    int rc = 0;
    int i;
    int pci;

    if (likely(sups->tfi != tfi)) continue;
    /* skip probes with an address beyond this map event; should not
       happen unless a shlib/exec got mmapped in weirdly piecemeal */
    if (likely((vm_flags & VM_EXEC) && sups->address >= length)) continue;

    /* Found a uprobe_spec for this stap_uprobe_tf.  Need to lock the
       stap_uprobes[] array to allocate a free spot, but then we can
       unlock and do the register_*probe subsequently. */
    mutex_lock (& stap_uprobes_lock);
    for (i = 0; i < MAXUPROBES; i++) { /* XXX: slow linear search */
      sup = & stap_uprobes[i];
      /* register new uprobe
         We make two passes for semaphores;
         see stap_uprobe_change_semaphore_plus */
      if (sup->spec_index < 0
          || (sups->sdt_sem_offset && vm_flags & VM_WRITE
              && sup->spec_index == spec_index)) {
#if (UPROBES_API_VERSION < 2)
        /* See PR6829 comment. */
        if (sup->spec_index == -1 && sup->up.kdata != NULL) continue;
        else if (sup->spec_index == -2 && sup->urp.u.kdata != NULL) continue;
#endif
        sup->spec_index = spec_index;
        slotted_p = 1;
        break;
      }
    }
    mutex_unlock (& stap_uprobes_lock);

#ifdef DEBUG_UPROBES
    _stp_dbug(__FUNCTION__, __LINE__,
              "+uprobe spec %d idx %d process %s[%d] addr %p pp %s\n",
              spec_index, (slotted_p ? i : -1), tsk->comm, tsk->tgid,
              (void*)(relocation + sups->address), sups->probe->pp);
#endif

    /* NB: check for user-module build-id only if we have a pathname
       at all; for a process(PID#).* probe, we may not.  If at some
       point we map process(PID#) to process("/proc/PID#/exe"), we'll
       get a pathname. */
    if (stf->pathname)
      if ((rc = _stp_usermodule_check(tsk, stf->pathname, relocation)))
        return rc;

    /* Here, slotted_p implies that `i' points to the single
       stap_uprobes[] element that has been slotted in for registration
       or unregistration processing.  !slotted_p implies that the table
       was full (registration; MAXUPROBES) or that no matching entry
       was found (unregistration; should not happen). */

    sdt_sem_pid = (sups->return_p ? sup->urp.u.pid : sup->up.pid);
    if (sups->sdt_sem_offset
        && (sdt_sem_pid != tsk->tgid || sup->sdt_sem_address == 0)) {
      /* If the probe is in an ET_EXEC binary, then the sdt_sem_offset
       * already is a real address.  But stap_uprobe_process_found calls
       * us in this case with relocation=offset=0, so we don't have to
       * worry about it. */
      sup->sdt_sem_address = (relocation - offset) + sups->sdt_sem_offset;
    } /* sdt_sem_offset */

    for (pci = 0; pci < sups->perf_counters_dim; pci++) {
      if ((sups->perf_counters)[pci] > -1)
        _stp_perf_read_init ((sups->perf_counters)[pci], tsk);
    }

    if (slotted_p) {
      struct stap_uprobe *sup = & stap_uprobes[i];
      if (sups->return_p) {
        sup->urp.u.pid = tsk->tgid;
        sup->urp.u.vaddr = relocation + sups->address;
        sup->urp.handler = &enter_uretprobe_probe;
        rc = register_uretprobe (& sup->urp);
      }
      else {
        sup->up.pid = tsk->tgid;
        sup->up.vaddr = relocation + sups->address;
        sup->up.handler = &enter_uprobe_probe;
        rc = register_uprobe (& sup->up);
      }
      /* The u*probe failed to register.  However, if we got EEXIST,
       * that means that the u*probe is already there, so just ignore
       * the error.  This could happen if CLONE_THREAD or CLONE_VM was
       * used. */
      if (rc != 0 && rc != -EEXIST) {
        _stp_warn ("u*probe failed %s[%d] '%s' addr %p rc %d\n",
                   tsk->comm, tsk->tgid, sups->probe->pp,
                   (void*)(relocation + sups->address), rc);
        /* NB: we need to release this slot,
           so we need to borrow the mutex temporarily. */
        mutex_lock (& stap_uprobes_lock);
        sup->spec_index = -1;
        sup->sdt_sem_address = 0;
        mutex_unlock (& stap_uprobes_lock);
      }
      else {
        handled_p = 1;
      }
    }

    /* NB: handled_p implies slotted_p */
    if (unlikely (! handled_p)) {
#ifdef STP_TIMING
      atomic_inc (skipped_count_uprobe_reg());
#endif
      /* NB: duplicates common_entryfn_epilogue,
         but then this is not a probe entry fn epilogue. */
#ifndef STAP_SUPPRESS_HANDLER_ERRORS
      if (unlikely (atomic_inc_return (skipped_count()) > MAXSKIPPED)) {
        if (unlikely (pseudo_atomic_cmpxchg(session_state(),
                                            STAP_SESSION_RUNNING,
                                            STAP_SESSION_ERROR)
                      == STAP_SESSION_RUNNING))
          _stp_error ("Skipped too many probes, check MAXSKIPPED or try"
                      " again with stap -t for more details.");
      }
#endif
    }
  } /* close iteration over stap_uprobe_spec[] */
  return 0; /* XXX: or rc? */
}
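The locking discipline here is the interesting part: the mutex is held only to claim a table slot, dropped across the registration call, and briefly re-taken to roll the slot back on failure. A simplified userspace sketch of the same pattern (illustrative names, not the SystemTap runtime; slots[] would need initializing to -1 before use):

#include <pthread.h>
#include <errno.h>

#define MAXSLOTS 64

struct slot { int spec_index; /* -1 == free */ };

static struct slot slots[MAXSLOTS];
static pthread_mutex_t slots_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for register_uprobe()/register_uretprobe(). */
static int register_probe(struct slot *s) { (void)s; return 0; }

static int claim_and_register(int spec_index)
{
  int i, rc;

  pthread_mutex_lock(&slots_lock);
  for (i = 0; i < MAXSLOTS; i++)    /* XXX: slow linear search, as above */
    if (slots[i].spec_index < 0)
      break;
  if (i == MAXSLOTS) {              /* table full: the !slotted_p case */
    pthread_mutex_unlock(&slots_lock);
    return -ENOSPC;
  }
  slots[i].spec_index = spec_index; /* claim the slot */
  pthread_mutex_unlock(&slots_lock);

  rc = register_probe(&slots[i]);   /* possibly slow; done unlocked */
  if (rc != 0 && rc != -EEXIST) {
    /* borrow the mutex again just to release the slot */
    pthread_mutex_lock(&slots_lock);
    slots[i].spec_index = -1;
    pthread_mutex_unlock(&slots_lock);
  }
  return rc;
}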
static int stap_uprobe_change_plus (struct task_struct *tsk,
                                    unsigned long relocation,
                                    unsigned long length,
                                    const struct stap_uprobe_tf *stf,
                                    unsigned long offset,
                                    unsigned long vm_flags)
{
  int tfi = (stf - stap_uprobe_finders);
  int spec_index;
  /* iterate over stap_uprobe_spec[] that use this same stap_uprobe_tf */
  for (spec_index = 0;
       spec_index < sizeof(stap_uprobe_specs)/sizeof(stap_uprobe_specs[0]);
       spec_index++) {
    int handled_p = 0;
    int slotted_p = 0;
    const struct stap_uprobe_spec *sups = &stap_uprobe_specs [spec_index];
    struct stap_uprobe *sup;
    pid_t sdt_sem_pid;
    int rc = 0;
    int i;

    if (likely(sups->tfi != tfi)) continue;
    /* skip probes with an address beyond this map event; should not
       happen unless a shlib/exec got mmapped in weirdly piecemeal */
    if (likely((vm_flags & VM_EXEC) && sups->address >= length)) continue;

    /* Found a uprobe_spec for this stap_uprobe_tf.  Need to lock the
       stap_uprobes[] array to allocate a free spot, but then we can
       unlock and do the register_*probe subsequently. */
    mutex_lock (& stap_uprobes_lock);
    for (i = 0; i < MAXUPROBES; i++) { /* XXX: slow linear search */
      sup = & stap_uprobes[i];
      /* register new uprobe
         We make two passes for semaphores;
         see _stap_uprobe_change_semaphore_plus */
      if (sup->spec_index < 0
          || (sups->sdt_sem_offset && vm_flags & VM_WRITE
              && sup->spec_index == spec_index)) {
#if (UPROBES_API_VERSION < 2)
        /* See PR6829 comment. */
        if (sup->spec_index == -1 && sup->up.kdata != NULL) continue;
        else if (sup->spec_index == -2 && sup->urp.u.kdata != NULL) continue;
#endif
        sup->spec_index = spec_index;
        slotted_p = 1;
        break;
      }
    }
    mutex_unlock (& stap_uprobes_lock);

#ifdef DEBUG_UPROBES
    _stp_dbug(__FUNCTION__, __LINE__,
              "+uprobe spec %d idx %d process %s[%d] addr %p pp %s\n",
              spec_index, (slotted_p ? i : -1), tsk->comm, tsk->tgid,
              (void*)(relocation + sups->address), sups->probe.pp);
#endif

    /* Here, slotted_p implies that `i' points to the single
       stap_uprobes[] element that has been slotted in for registration
       or unregistration processing.  !slotted_p implies that the table
       was full (registration; MAXUPROBES) or that no matching entry
       was found (unregistration; should not happen). */

    sdt_sem_pid = (sups->return_p ? sup->urp.u.pid : sup->up.pid);
    if (sups->sdt_sem_offset
        && (sdt_sem_pid != tsk->tgid || sup->sdt_sem_address == 0)) {
      /* If the probe is in the executable itself,
         the offset *is* the address. */
      if (vm_flags & VM_EXECUTABLE) {
        sup->sdt_sem_address = relocation + sups->sdt_sem_offset;
      }
      else {
        sup->sdt_sem_address = (relocation - offset) + sups->sdt_sem_offset;
      }
    } /* sdt_sem_offset */

    if (slotted_p) {
      struct stap_uprobe *sup = & stap_uprobes[i];
      if (sups->return_p) {
        sup->urp.u.pid = tsk->tgid;
        sup->urp.u.vaddr = relocation + sups->address;
        sup->urp.handler = &enter_uretprobe_probe;
        rc = register_uretprobe (& sup->urp);
      }
      else {
        sup->up.pid = tsk->tgid;
        sup->up.vaddr = relocation + sups->address;
        sup->up.handler = &enter_uprobe_probe;
        rc = register_uprobe (& sup->up);
      }
      if (rc) { /* failed to register */
        _stp_warn ("u*probe failed %s[%d] '%s' addr %p rc %d\n",
                   tsk->comm, tsk->tgid, sups->probe.pp,
                   (void*)(relocation + sups->address), rc);
        /* NB: we need to release this slot,
           so we need to borrow the mutex temporarily. */
        mutex_lock (& stap_uprobes_lock);
        sup->spec_index = -1;
        mutex_unlock (& stap_uprobes_lock);
      }
      else {
        handled_p = 1;
      }
    }

    /* NB: handled_p implies slotted_p */
    if (unlikely (! handled_p)) {
#ifdef STP_TIMING
      atomic_inc (& skipped_count_uprobe_reg);
#endif
      /* NB: duplicates common_entryfn_epilogue,
         but then this is not a probe entry fn epilogue. */
      if (unlikely (atomic_inc_return (& skipped_count) > MAXSKIPPED)) {
        if (unlikely (pseudo_atomic_cmpxchg(& session_state,
                                            STAP_SESSION_RUNNING,
                                            STAP_SESSION_ERROR)
                      == STAP_SESSION_RUNNING))
          _stp_error ("Skipped too many probes, check MAXSKIPPED or try"
                      " again with stap -t for more details.");
      }
    }
  } /* close iteration over stap_uprobe_spec[] */
  return 0; /* XXX: or rc? */
}
/* mmap callback, will match new vma with _stp_module or register vma name. */
static int _stp_vma_mmap_cb(struct stap_task_finder_target *tgt,
                            struct task_struct *tsk,
                            char *path, struct dentry *dentry,
                            unsigned long addr,
                            unsigned long length,
                            unsigned long offset,
                            unsigned long vm_flags)
{
  int i, res;
  struct _stp_module *module = NULL;
  const char *name = ((dentry != NULL)
                      ? (char *)dentry->d_name.name : NULL);

  if (path == NULL || *path == '\0') /* unknown? */
    path = (char *)name; /* we'll copy this soon, in ..._add_vma_... */

  dbug_task_vma(1, "mmap_cb: tsk %d:%d path %s, addr 0x%08lx,"
                " length 0x%08lx, offset 0x%lx, flags 0x%lx\n",
                tsk->pid, tsk->tgid, path, addr, length, offset, vm_flags);

  // We are only interested in the first load of the whole module that
  // is executable. We register whether or not we know the module,
  // so we can later look up the name given an address for this task.
  if (path != NULL && offset == 0 && (vm_flags & VM_EXEC)
      && stap_find_vma_map_info(tsk, addr, NULL, NULL, NULL, NULL) != 0) {
    for (i = 0; i < _stp_num_modules; i++) {
      if (strcmp(path, _stp_modules[i]->path) == 0) {
        unsigned long vm_start = 0;
        unsigned long vm_end = 0;
        dbug_task_vma(1, "vm_cb: matched path %s to module (sec: %s)\n",
                      path, _stp_modules[i]->sections[0].name);
        module = _stp_modules[i];
        /* Make sure we really don't know about this module yet.
           If we do know, we might want to extend the coverage. */
        res = stap_find_vma_map_info_user(tsk->group_leader, module,
                                          &vm_start, &vm_end, NULL);
        if (res == -ESRCH)
          res = stap_add_vma_map_info(tsk->group_leader, addr,
                                      addr + length, path, module);
        else if (res == 0 && vm_end + 1 == addr)
          res = stap_extend_vma_map_info(tsk->group_leader, vm_start,
                                         addr + length);
        /* VMA entries are allocated dynamically; this is fine, since
         * we are in a task_finder callback, which runs in user
         * context. */
        if (res != 0) {
          _stp_error ("Couldn't register module '%s' for pid %d (%d)\n",
                      _stp_modules[i]->path, tsk->group_leader->pid, res);
        }
        return 0;
      }
    }

    /* None of the tracked modules matched; register without a module,
     * to make sure we can look up the name later.  Ignore errors; we
     * will just report "unknown" when asked if the tables were full.
     * Restrict to the target process when one is given, to preserve
     * vma_map entry slots. */
    if (_stp_target == 0 || _stp_target == tsk->group_leader->pid) {
      res = stap_add_vma_map_info(tsk->group_leader, addr,
                                  addr + length, path, NULL);
      dbug_task_vma(1, "registered '%s' for %d (res:%d) [%lx-%lx]\n",
                    path, tsk->group_leader->pid, res,
                    addr, addr + length);
    }
  } else if (path != NULL) {
    // Once registered, we may want to extend an earlier registered
    // region. A segment might be mapped with different flags for
    // different offsets. If so, we want to record the extended range
    // so we can map addresses more precisely to module names and
    // symbols.
    res = stap_extend_vma_map_info(tsk->group_leader,
                                   addr, addr + length);
    dbug_task_vma(1, "extended '%s' for %d (res:%d) [%lx-%lx]\n",
                  path, tsk->group_leader->pid, res,
                  addr, addr + length);
  }
  return 0;
}
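The add-or-extend bookkeeping above is easier to see in isolation. A minimal userspace sketch of the same idea (structure and function names are made up, not the SystemTap runtime): if an entry for the same file ends right before the new mapping, grow it; otherwise record a new entry, and just answer "unknown" later if the table fills up.

#include <stdio.h>
#include <string.h>

#define MAX_ENTRIES 16

struct vma_entry { unsigned long start, end; char path[64]; };

static struct vma_entry vma_map[MAX_ENTRIES];
static int n_entries;

static int add_or_extend(unsigned long addr, unsigned long length,
                         const char *path)
{
  int i;

  for (i = 0; i < n_entries; i++) {
    /* Contiguous with an existing entry for the same file: extend,
     * mirroring the 'vm_end + 1 == addr' test above. */
    if (strcmp(vma_map[i].path, path) == 0
        && vma_map[i].end + 1 == addr) {
      vma_map[i].end = addr + length;
      return 0;
    }
  }
  if (n_entries == MAX_ENTRIES)
    return -1; /* table full; lookups will just answer "unknown" */
  vma_map[n_entries].start = addr;
  vma_map[n_entries].end = addr + length;
  snprintf(vma_map[n_entries].path, sizeof(vma_map[n_entries].path),
           "%s", path);
  n_entries++;
  return 0;
}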
static int _stp_create_procfs(const char *path, int num,
                              const struct file_operations *fops,
                              int perm, void *data)
{
  const char *p;
  char *next;
  struct proc_dir_entry *last_dir, *de;

  if (num >= STP_MAX_PROCFS_FILES) {
    _stp_error("Requested file number %d is larger than max (%d)\n",
               num, STP_MAX_PROCFS_FILES);
    return -1;
  }

  last_dir = _stp_proc_root;

  /* if no path, use default one */
  if (strlen(path) == 0)
    p = "command";
  else
    p = path;

#ifdef _STP_ALLOW_PROCFS_PATH_SUBDIRS
  while ((next = strchr(p, '/'))) {
    if (_stp_num_pde == STP_MAX_PROCFS_FILES)
      goto too_many;
    *next = 0;
    de = _stp_procfs_lookup(p, last_dir);
    if (de == NULL) {
      last_dir = proc_mkdir(p, last_dir);
      if (!last_dir) {
        _stp_error("Could not create directory \"%s\"\n", p);
        goto err;
      }
      _stp_pde[_stp_num_pde++] = last_dir;
#ifdef STAPCONF_PROCFS_OWNER
      last_dir->owner = THIS_MODULE;
#endif
      proc_set_user(last_dir, KUIDT_INIT(_stp_uid), KGIDT_INIT(_stp_gid));
    }
    else {
      last_dir = de;
    }
    p = next + 1;
  }
#else  /* !_STP_ALLOW_PROCFS_PATH_SUBDIRS */
  if (strchr(p, '/') != NULL) {
    _stp_error("Could not create path \"%s\","
               " contains subdirectories\n", p);
    goto err;
  }
#endif /* !_STP_ALLOW_PROCFS_PATH_SUBDIRS */

  if (_stp_num_pde == STP_MAX_PROCFS_FILES)
    goto too_many;

  de = proc_create_data(p, perm, last_dir, fops, data);
  if (de == NULL) {
    _stp_error("Could not create file \"%s\" in path \"%s\"\n", p, path);
    goto err;
  }
#ifdef STAPCONF_PROCFS_OWNER
  de->owner = THIS_MODULE;
#endif
  proc_set_user(de, KUIDT_INIT(_stp_uid), KGIDT_INIT(_stp_gid));
  _stp_pde[_stp_num_pde++] = de;
  return 0;

too_many:
  _stp_error("Attempted to open too many procfs files. Maximum is %d\n",
             STP_MAX_PROCFS_FILES);
err:
  _stp_close_procfs();
  return -1;
}
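For context, here is a minimal self-contained sketch of proc_create_data() usage, assuming a pre-5.6 kernel where it still takes a struct file_operations to match the fops parameter above (5.6+ switched to struct proc_ops, and PDE_DATA() was renamed pde_data() in 5.17). The "stap_example" entry name is made up for illustration.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
  /* m->private carries the 'data' pointer passed to proc_create_data(). */
  seq_printf(m, "private data: %s\n", (char *)m->private);
  return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
  /* PDE_DATA() recovers the per-entry data from the proc inode. */
  return single_open(file, example_show, PDE_DATA(inode));
}

static const struct file_operations example_fops = {
  .owner   = THIS_MODULE,
  .open    = example_open,
  .read    = seq_read,
  .llseek  = seq_lseek,
  .release = single_release,
};

static struct proc_dir_entry *example_de;
static char example_data[] = "hello";

static int __init example_init(void)
{
  example_de = proc_create_data("stap_example", 0444, NULL,
                                &example_fops, example_data);
  return example_de ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
  proc_remove(example_de);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");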