/*
 * Allocate or release the disk buffer.
 *
 * size       Requested buffer size. If a buffer already exists and size is 0,
 *            the buffer is released through do_unmap() and RCOK is returned.
 * buf_start  in/out: reset to 0 before any allocation attempt; the selected
 *            allocation helper is handed this reference to fill in.
 * (bool)     Only named when HAVE_HUGETLBFS is defined
 *            (use_normal_if_huge_fails): when true, a failed hugetlbfs
 *            attempt falls back to set_bufsize_normal() instead of returning
 *            the error.
 *
 * Returns RC(fcINTERNAL) if a buffer already exists and size is non-zero,
 * otherwise whatever the selected allocation helper returns.
 */
w_rc_t sthread_t::set_bufsize(size_t size, char *&buf_start /* in/out*/,
    bool
#ifdef HAVE_HUGETLBFS
    // This argument is used only by the unit tests.
    use_normal_if_huge_fails /*=false*/
#endif
    )
{
    // Existing buffer + size 0 means "release the buffer".
    if (_disk_buffer && size == 0) {
        do_unmap();
        return RCOK;
    }

    // Existing buffer + non-zero size: the caller must release it first.
    if (_disk_buffer) {
        std::cerr << "Can't re-allocate disk buffer without disabling"
            << std::endl;
        return RC(fcINTERNAL);
    }

    buf_start = 0;

    long system_page_size = sysconf(_SC_PAGESIZE);

#ifdef WITHOUT_MMAP
    // If the user configured --without-mmap, then don't even
    // bother with the mmap attempts below.
    return set_bufsize_memalign(size, buf_start, system_page_size);
#endif

#ifdef HAVE_HUGETLBFS
    // Ok, we have to have configured for hugefs AND we have to
    // have set a path for it. If we have no path string,
    // we have chosen not to use hugetlbfs. This is the result
    // of setting run-time options sm_hugetlbfs_path to "NULL".
    // So if we've set the path to "NULL", we will just use the
    // "normal way".
    if (hugefs_path != NULL) {
        w_rc_t rc = set_bufsize_huge(size, buf_start, system_page_size);
        if (!rc.is_error()) {
#if W_DEBUG_LEVEL > 10
            cout << "Using hugetlbfs size " << size
                << " system_page_size " << system_page_size
                << " path " << hugefs_path << ". " << std::endl;
#endif
            return rc;
        }
        if (!use_normal_if_huge_fails) {
            return rc;
        }
        // else, try the other way
        std::cerr << "Skipping hugetlbfs due to mmap failure: " << rc
            << std::endl;
    } else {
        cout << "Skipping hugetlbfs based on user option. " << std::endl;
    }
#endif

    return set_bufsize_normal(size, buf_start, system_page_size);
}
errval_t page_mappings_unmap(struct capability *pgtable, struct cte *mapping, size_t slot, size_t num_pages) { assert(type_is_vnode(pgtable->type)); errval_t err; debug(SUBSYS_PAGING, "page_mappings_unmap(%zd pages)\n", num_pages); // get page table entry data genpaddr_t paddr; read_pt_entry(pgtable, slot, &paddr, NULL, NULL); lvaddr_t pt = local_phys_to_mem(gen_phys_to_local_phys(get_address(pgtable))); // get virtual address of first page // TODO: error checking genvaddr_t vaddr; bool tlb_flush_necessary = true; struct cte *leaf_pt = cte_for_cap(pgtable); err = compile_vaddr(leaf_pt, slot, &vaddr); if (err_is_fail(err)) { if (err_no(err) == SYS_ERR_VNODE_NOT_INSTALLED) { debug(SUBSYS_PAGING, "couldn't reconstruct virtual address\n"); } else if (err_no(err) == SYS_ERR_VNODE_SLOT_INVALID && leaf_pt->mapping_info.pte == 0) { debug(SUBSYS_PAGING, "unmapping in floating page table; not flushing TLB\n"); tlb_flush_necessary = false; } else { return err; } } if (num_pages != mapping->mapping_info.pte_count) { // want to unmap a different amount of pages than was mapped return SYS_ERR_VM_MAP_SIZE; } do_unmap(pt, slot, num_pages); // flush TLB for unmapped pages if we got a valid virtual address // TODO: heuristic that decides if selective or full flush is more // efficient? if (tlb_flush_necessary) { if (num_pages > 1 || err_is_fail(err)) { do_full_tlb_flush(); } else { do_one_tlb_flush(vaddr); } } // update mapping info memset(&mapping->mapping_info, 0, sizeof(struct mapping_info)); return SYS_ERR_OK; }
errval_t page_mappings_unmap(struct capability *pgtable, struct cte *mapping) { assert(type_is_vnode(pgtable->type)); assert(type_is_mapping(mapping->cap.type)); struct Frame_Mapping *info = &mapping->cap.u.frame_mapping; errval_t err; debug(SUBSYS_PAGING, "page_mappings_unmap(%hu pages)\n", info->pte_count); // calculate page table address lvaddr_t pt = local_phys_to_mem(gen_phys_to_local_phys(get_address(pgtable))); cslot_t slot = info->entry; // get virtual address of first page genvaddr_t vaddr; bool tlb_flush_necessary = true; struct cte *leaf_pt = cte_for_cap(pgtable); err = compile_vaddr(leaf_pt, slot, &vaddr); if (err_is_fail(err)) { if (err_no(err) == SYS_ERR_VNODE_NOT_INSTALLED && vaddr == 0) { debug(SUBSYS_PAGING, "unmapping in floating page table; not flushing TLB\n"); tlb_flush_necessary = false; } else if (err_no(err) == SYS_ERR_VNODE_SLOT_INVALID) { debug(SUBSYS_PAGING, "couldn't reconstruct virtual address\n"); } else { return err; } } do_unmap(pt, slot, info->pte_count); // flush TLB for unmapped pages if we got a valid virtual address // TODO: heuristic that decides if selective or full flush is more // efficient? if (tlb_flush_necessary) { if (info->pte_count > 1 || err_is_fail(err)) { do_full_tlb_flush(); } else { do_one_tlb_flush(vaddr); } } return SYS_ERR_OK; }
/*
 * Convenience overload: allocate a disk buffer of `size` bytes and return
 * its start address.
 *
 * A size of 0 releases the buffer via do_unmap() and returns NULL.
 * If the underlying set_bufsize(size, start) reports an error, the error is
 * printed to std::cerr and 0 is returned; callers cannot see the w_rc_t.
 */
char *
sthread_t::set_bufsize(size_t size)
{
    if (size == 0) {
        do_unmap();
        return NULL;
    }

    // Initialised so the pointer is well defined on every path.
    char *start = NULL;
    w_rc_t e = set_bufsize(size, start);

    if (e.is_error()) {
        std::cerr << "Hidden Failure: set_bufsize(" << size << "):"
            << std::endl << e << std::endl;
        return 0;
    }

    // size is known to be non-zero here, so no further special-casing is
    // needed before handing the buffer back.
    return start;
}
int steal_pty(pid_t pid, int *pty) { int err = 0; struct steal_pty_state steal = {}; long page_size = sysconf(_SC_PAGE_SIZE); if ((err = preflight_check(pid))) goto out; if ((err = get_terminal_state(&steal, pid))) goto out; if ((err = setup_steal_socket(&steal))) goto out; debug("Listening on socket: %s", steal.addr_un.sun_path); debug("Attaching terminal emulator pid=%d", steal.emulator_pid); if ((err = grab_pid(steal.emulator_pid, &steal.child, &steal.child_scratch))) goto out; debug("Attached to terminal emulator (pid %d)", (int)steal.emulator_pid); if ((err = find_master_fd(&steal))) { error("Unable to find the fd for the pty!"); goto out; } if ((err = setup_steal_socket_child(&steal))) goto out; if ((err = steal_child_pty(&steal))) goto out; if ((err = steal_block_hup(&steal))) goto out; if ((err = steal_cleanup_child(&steal))) goto out; goto out_no_child; out: if (steal.ptyfd) { close(steal.ptyfd); steal.ptyfd = 0; } if (steal.child_fd > 0) do_syscall(&steal.child, close, steal.child_fd, 0, 0, 0, 0, 0); if (steal.child_scratch > 0) do_unmap(&steal.child, steal.child_scratch, page_size); if (steal.child.state != ptrace_detached) { ptrace_restore_regs(&steal.child); ptrace_detach_child(&steal.child); } out_no_child: if (steal.sockfd > 0) { close(steal.sockfd); unlink(steal.addr_un.sun_path); } if (steal.tmpdir[0]) { rmdir(steal.tmpdir); } if (steal.ptyfd) *pty = steal.ptyfd; free(steal.master_fds.fds); return err; }
int attach_child(pid_t pid, const char *pty, int force_stdio) { struct ptrace_child child; child_addr_t scratch_page = -1; int *child_tty_fds = NULL, n_fds, child_fd, statfd = -1; int i; int err = 0; long page_size = sysconf(_SC_PAGE_SIZE); #ifdef __linux__ char stat_path[PATH_MAX]; #endif if ((err = check_pgroup(pid))) { return err; } if ((err = preflight_check(pid))) { return err; } debug("Using tty: %s", pty); if ((err = copy_tty_state(pid, pty))) { if (err == ENOTTY && !force_stdio) { error("Target is not connected to a terminal.\n" " Use -s to force attaching anyways."); return err; } } #ifdef __linux__ snprintf(stat_path, sizeof stat_path, "/proc/%d/stat", pid); statfd = open(stat_path, O_RDONLY); if (statfd < 0) { error("Unable to open %s: %s", stat_path, strerror(errno)); return -statfd; } #endif kill(pid, SIGTSTP); wait_for_stop(pid, statfd); if ((err = grab_pid(pid, &child, &scratch_page))) { goto out_cont; } if (force_stdio) { child_tty_fds = malloc(3 * sizeof(int)); if (!child_tty_fds) { err = ENOMEM; goto out_unmap; } n_fds = 3; child_tty_fds[0] = 0; child_tty_fds[1] = 1; child_tty_fds[2] = 2; } else { child_tty_fds = get_child_tty_fds(&child, statfd, &n_fds); if (!child_tty_fds) { err = child.error; goto out_unmap; } } if (ptrace_memcpy_to_child(&child, scratch_page, pty, strlen(pty) + 1)) { err = child.error; error("Unable to memcpy the pty path to child."); goto out_free_fds; } child_fd = do_syscall(&child, openat, -1, scratch_page, O_RDWR | O_NOCTTY, 0, 0, 0); if (child_fd < 0) { err = child_fd; error("Unable to open the tty in the child."); goto out_free_fds; } debug("Opened the new tty in the child: %d", child_fd); err = ignore_hup(&child, scratch_page); if (err < 0) goto out_close; err = do_syscall(&child, getsid, 0, 0, 0, 0, 0, 0); if (err != child.pid) { debug("Target is not a session leader, attempting to setsid."); err = do_setsid(&child); } else { do_syscall(&child, ioctl, child_tty_fds[0], TIOCNOTTY, 0, 0, 0, 0); } if (err < 0) goto 
out_close; err = do_syscall(&child, ioctl, child_fd, TIOCSCTTY, 1, 0, 0, 0); if (err != 0) { /* Seems to be returning >0 for error */ error("Unable to set controlling terminal: %s", strerror(err)); goto out_close; } debug("Set the controlling tty"); for (i = 0; i < n_fds; i++) { err = do_dup2(&child, child_fd, child_tty_fds[i]); if (err < 0) error("Problem moving child fd number %d to new tty: %s", child_tty_fds[i], strerror(errno)); } err = 0; out_close: do_syscall(&child, close, child_fd, 0, 0, 0, 0, 0); out_free_fds: free(child_tty_fds); out_unmap: do_unmap(&child, scratch_page, page_size); ptrace_restore_regs(&child); ptrace_detach_child(&child); if (err == 0) { kill(child.pid, SIGSTOP); wait_for_stop(child.pid, statfd); } kill(child.pid, SIGWINCH); out_cont: kill(child.pid, SIGCONT); #ifdef __linux__ close(statfd); #endif return err < 0 ? -err : err; }
int attach_child(pid_t pid, const char *pty, int force_stdio) { struct ptrace_child child; unsigned long scratch_page = -1; int *child_tty_fds = NULL, n_fds, child_fd, statfd; int i; int err = 0; long page_size = sysconf(_SC_PAGE_SIZE); char stat_path[PATH_MAX]; long mmap_syscall; if ((err = copy_tty_state(pid, pty))) { if (err == ENOTTY && !force_stdio) { error("Target is not connected to a terminal.\n" " Use -s to force attaching anyways."); return err; } } snprintf(stat_path, sizeof stat_path, "/proc/%d/stat", pid); statfd = open(stat_path, O_RDONLY); if (statfd < 0) { error("Unable to open %s: %s", stat_path, strerror(errno)); return -statfd; } kill(pid, SIGTSTP); wait_for_stop(pid, statfd); if (ptrace_attach_child(&child, pid)) { err = child.error; goto out_cont; } if (ptrace_advance_to_state(&child, ptrace_at_syscall)) { err = child.error; goto out_detach; } if (ptrace_save_regs(&child)) { err = child.error; goto out_detach; } mmap_syscall = ptrace_syscall_numbers(&child)->nr_mmap2; if (mmap_syscall == -1) mmap_syscall = ptrace_syscall_numbers(&child)->nr_mmap; scratch_page = ptrace_remote_syscall(&child, mmap_syscall, 0, page_size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); if (scratch_page > (unsigned long)-1000) { err = -(signed long)scratch_page; goto out_unmap; } debug("Allocated scratch page: %lx", scratch_page); if (force_stdio) { child_tty_fds = malloc(3 * sizeof(int)); if (!child_tty_fds) { err = ENOMEM; goto out_unmap; } n_fds = 3; child_tty_fds[0] = 0; child_tty_fds[1] = 1; child_tty_fds[2] = 2; } else { child_tty_fds = get_child_tty_fds(&child, statfd, &n_fds); if (!child_tty_fds) { err = child.error; goto out_unmap; } } if (ptrace_memcpy_to_child(&child, scratch_page, pty, strlen(pty)+1)) { err = child.error; error("Unable to memcpy the pty path to child."); goto out_free_fds; } child_fd = do_syscall(&child, open, scratch_page, O_RDWR|O_NOCTTY, 0, 0, 0, 0); if (child_fd < 0) { err = child_fd; error("Unable to open the tty in the 
child."); goto out_free_fds; } debug("Opened the new tty in the child: %d", child_fd); err = ignore_hup(&child, scratch_page); if (err < 0) goto out_close; err = do_syscall(&child, getsid, 0, 0, 0, 0, 0, 0); if (err != child.pid) { debug("Target is not a session leader, attempting to setsid."); err = do_setsid(&child); } else { do_syscall(&child, ioctl, child_tty_fds[0], TIOCNOTTY, 0, 0, 0, 0); } if (err < 0) goto out_close; err = do_syscall(&child, ioctl, child_fd, TIOCSCTTY, 0, 0, 0, 0); if (err < 0) { error("Unable to set controlling terminal."); goto out_close; } debug("Set the controlling tty"); for (i = 0; i < n_fds; i++) do_syscall(&child, dup2, child_fd, child_tty_fds[i], 0, 0, 0, 0); err = 0; out_close: do_syscall(&child, close, child_fd, 0, 0, 0, 0, 0); out_free_fds: free(child_tty_fds); out_unmap: do_unmap(&child, scratch_page, page_size); ptrace_restore_regs(&child); out_detach: ptrace_detach_child(&child); if (err == 0) { kill(child.pid, SIGSTOP); wait_for_stop(child.pid, statfd); } kill(child.pid, SIGWINCH); out_cont: kill(child.pid, SIGCONT); close(statfd); return err < 0 ? -err : err; }
errval_t unmap_capability(struct cte *mem) { errval_t err; TRACE_CAP_MSG("unmapping", mem); genvaddr_t vaddr = 0; bool single_page_flush = false; int mapping_count = 0, unmap_count = 0; genpaddr_t faddr = get_address(&mem->cap); // iterate over all mappings associated with 'mem' and unmap them struct cte *next = mem; struct cte *to_delete = NULL; while ((next = mdb_successor(next)) && get_address(&next->cap) == faddr) { TRACE_CAP_MSG("looking at", next); if (next->cap.type == get_mapping_type(mem->cap.type) && next->cap.u.frame_mapping.cap == &mem->cap) { TRACE_CAP_MSG("cleaning up mapping", next); mapping_count ++; // do unmap struct Frame_Mapping *mapping = &next->cap.u.frame_mapping; struct cte *pgtable = mapping->ptable; if (!pgtable) { debug(SUBSYS_PAGING, "mapping->ptable == 0: just deleting mapping\n"); // mem is not mapped, so just return goto delete_mapping; } if (!type_is_vnode(pgtable->cap.type)) { debug(SUBSYS_PAGING, "mapping->ptable.type not vnode (%d): just deleting mapping\n", mapping->ptable->cap.type); // mem is not mapped, so just return goto delete_mapping; } lpaddr_t ptable_lp = gen_phys_to_local_phys(get_address(&pgtable->cap)); lvaddr_t ptable_lv = local_phys_to_mem(ptable_lp); cslot_t slot = mapping->entry; // unmap do_unmap(ptable_lv, slot, mapping->pte_count); unmap_count ++; // TLB flush? 
if (unmap_count == 1) { err = compile_vaddr(pgtable, slot, &vaddr); if (err_is_ok(err) && mapping->pte_count == 1) { single_page_flush = true; } } delete_mapping: assert(!next->delete_node.next); // mark mapping cap for delete: cannot do delete here as it messes // up mdb_successor() next->delete_node.next = to_delete; to_delete = next; } } // delete mapping caps while (to_delete) { next = to_delete->delete_node.next; err = caps_delete(to_delete); if (err_is_fail(err)) { printk(LOG_NOTE, "caps_delete: %"PRIuERRV"\n", err); } to_delete = next; } TRACE_CAP_MSGF(mem, "unmapped %d/%d instances", unmap_count, mapping_count); // do TLB flush if (single_page_flush) { do_one_tlb_flush(vaddr); } else { do_full_tlb_flush(); } return SYS_ERR_OK; }
/*
 * Allocate the disk buffer with an anonymous mmap ("normal" path, no
 * hugetlbfs file).
 *
 * size              Requested size; padded internally for alignment.
 * buf_start         in/out: on success set to the aligned buffer start.
 * system_page_size  System page size supplied by the caller.
 *
 * The region is first mapped PROT_NONE, then the aligned start address is
 * computed and the useful part is made readable/writable with mprotect().
 *
 * Returns RC(fcMMAPFAILED) if mmap() or mprotect() fails, RCOK otherwise.
 */
w_rc_t sthread_t::set_bufsize_normal(
    size_t size, char *&buf_start /* in/out*/, long system_page_size)
{
    size_t requested_size = size; // save for asserts later

    // ***********************************************************
    //
    //  GET PAGE SIZES
    //
    // If the SM pagesize is larger than the largest system page size,
    // align everything on the former (safe and is less confusing).
    //
    // ***********************************************************
    long max_page_size = get_max_page_size(system_page_size);
    w_assert1(system_page_size <= max_page_size);
    long align_page_size = (SM_PAGESIZE > max_page_size)?
        SM_PAGESIZE : max_page_size;

    // ***********************************************************
    //
    //  GET FILE DESCRIPTOR FOR MMAP
    //
    // ***********************************************************
    int fd(-1); // must be -1 if not mapping to a file

    // ***********************************************************
    //
    //  GET FLAGS FOR MMAP
    //
    // If posix mmapped file are available, _POSIX_MAPPED_FILES is defined
    // in <unistd.h> to be > 0
    //
    // That should give you these flags:
    // MAP_FIXED, MAP_PRIVATE, MAP_NORESERVE, MAP_ANONYMOUS
    // If MAP_ANONYMOUS is not there, MAP_ANON might be.
    //
    // However... systems aren't exactly in sync here, so configure.ac
    // checks for each of these flags.
    //
    // ***********************************************************
    int flags1 = MAP_PRIVATE;
    size_t extra_align = align_page_size;
    size_t align_arg = 0;
#if HAVE_DECL_MAP_ANONYMOUS==1
    flags1 |= MAP_ANONYMOUS;
#elif HAVE_DECL_MAP_ANON==1
    flags1 |= MAP_ANON;
#else
#endif

#if HAVE_DECL_MAP_NORESERVE==1
    flags1 |= MAP_NORESERVE;
#endif

#if HAVE_DECL_MAP_ALIGN==1
    flags1 |= MAP_ALIGN;
    extra_align = 0;
    align_arg = align_page_size;
#endif

    // add the extra alignment to the size requested before alignment,
    // and then do our own alignment at the end. In the case of
    // MAP_ALIGN this is unnecessary, and the extra alignment is zero.
    size += extra_align;
    // NOTE(review): _disk_buffer_size is used as the mmap length below and
    // is presumably set by align_bufsize() -- confirm.
    align_bufsize(size, system_page_size, align_page_size);

    // ***********************************************************
    //
    // FIRST MMAP: get a mapped region from the kernel.
    // In this function fd is always -1, so this is an anonymous
    // mapping created with no access permissions; the useful part
    // is opened up with mprotect() further down.
    //
    // ***********************************************************
    errno = 0;
    _disk_buffer = (char*) mmap((char*)align_arg, _disk_buffer_size,
                PROT_NONE,
                flags1,
                fd,   /* fd */
                0     /* off_t */
                );

    if (_disk_buffer == MAP_FAILED) {
        std::cerr
            << __LINE__ << " "
            << "mmap (size=" << _disk_buffer_size
            << " = " << int(_disk_buffer_size/1024)
            << " KB ) returns " << long(_disk_buffer)
            << " errno is " << errno << " " << strerror(errno)
            << " flags " << flags1
            << " fd " << fd
            << std::endl;
        // MAP_FAILED is non-null; leaving it in _disk_buffer would make
        // later "if (_disk_buffer)" checks treat the failed mapping as an
        // allocated buffer. Reset it so the object is back in the
        // "no buffer" state.
        _disk_buffer = NULL;
        return RC(fcMMAPFAILED);
    }
#if W_DEBUG_LEVEL > 4
    else
    {
        std::cerr
            << __LINE__ << " "
            << "mmap SUCCESS! (size=" << _disk_buffer_size
            << " = " << int(_disk_buffer_size/1024)
            << " KB ) returns " << long(_disk_buffer)
            << " errno is " << errno << " " << strerror(errno)
            << " flags " << flags1
            << " fd " << fd
            << std::endl;
    }
#endif

    // ***********************************************************
    //
    // RE-MMAP: manually align the region and give the useful part R/W
    // permissions.
    //
    // ***********************************************************
    _disk_buffer = (char*)alignon(_disk_buffer, align_page_size);
    // NOTE(review): the result of this alignon() is discarded, so
    // requested_size looks unchanged here unless alignon modifies its
    // argument -- confirm against its definition.
    alignon(requested_size, system_page_size);
    if (mprotect(_disk_buffer, requested_size, PROT_READ|PROT_WRITE)) {
        std::cerr
            << __LINE__ << " "
            << "mprotect (addr=" << long(_disk_buffer)
            << ", size=" << requested_size << ") returns -1;"
            << " errno is " << errno << " " << strerror(errno)
            << std::endl;
        do_unmap();
        return RC(fcMMAPFAILED);
    }

#ifdef HAVE_MEMCNTL
    struct memcntl_mha info;
    info.mha_cmd = MHA_MAPSIZE_VA;
    info.mha_flags = 0;
    info.mha_pagesize = max_page_size;
    // Ask the kernel to use the max page size here.
    // A failure is only reported; the allocation still succeeds.
    if (memcntl(_disk_buffer, requested_size, MC_HAT_ADVISE, (char *)&info, 0, 0) < 0)
    {
        std::cerr << "memcntl returns -1;"
            << " errno is " << errno << " " << strerror(errno)
            << " requested size " << max_page_size
            << std::endl;
    }
#endif

    align_for_sm(requested_size);
    buf_start = _disk_buffer;
    clear(buf_start, requested_size);
    return RCOK;
}